linux/net/core/dev.c
   1/*
   2 *      NET3    Protocol independent device support routines.
   3 *
   4 *              This program is free software; you can redistribute it and/or
   5 *              modify it under the terms of the GNU General Public License
   6 *              as published by the Free Software Foundation; either version
   7 *              2 of the License, or (at your option) any later version.
   8 *
   9 *      Derived from the non IP parts of dev.c 1.0.19
  10 *              Authors:        Ross Biro
  11 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *      Additional Authors:
  15 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *              David Hinds <dahinds@users.sourceforge.net>
  18 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *              Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *      Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *                                      to 2 if register_netdev gets called
  25 *                                      before net_dev_init & also removed a
  26 *                                      few lines of code in the process.
  27 *              Alan Cox        :       device private ioctl copies fields back.
  28 *              Alan Cox        :       Transmit queue code does relevant
  29 *                                      stunts to keep the queue safe.
  30 *              Alan Cox        :       Fixed double lock.
  31 *              Alan Cox        :       Fixed promisc NULL pointer trap
  32 *              ????????        :       Support the full private ioctl range
  33 *              Alan Cox        :       Moved ioctl permission check into
  34 *                                      drivers
  35 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36 *              Alan Cox        :       100 backlog just doesn't cut it when
  37 *                                      you start doing multicast video 8)
  38 *              Alan Cox        :       Rewrote net_bh and list manager.
  39 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40 *              Alan Cox        :       Took out transmit every packet pass
  41 *                                      Saved a few bytes in the ioctl handler
  42 *              Alan Cox        :       Network driver sets packet type before
  43 *                                      calling netif_rx. Saves a function
  44 *                                      call a packet.
  45 *              Alan Cox        :       Hashed net_bh()
  46 *              Richard Kooijman:       Timestamp fixes.
  47 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48 *              Alan Cox        :       Device lock protection.
  49 *              Alan Cox        :       Fixed nasty side effect of device close
  50 *                                      changes.
  51 *              Rudi Cilibrasi  :       Pass the right thing to
  52 *                                      set_mac_address()
  53 *              Dave Miller     :       32bit quantity for the device lock to
  54 *                                      make it work out on a Sparc.
  55 *              Bjorn Ekwall    :       Added KERNELD hack.
  56 *              Alan Cox        :       Cleaned up the backlog initialise.
  57 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58 *                                      1 device.
  59 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60 *                                      is no device open function.
  61 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63 *              Cyrus Durgin    :       Cleaned for KMOD
  64 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65 *                                      A network device unload needs to purge
  66 *                                      the backlog queue.
  67 *      Paul Rusty Russell      :       SIOCSIFNAME
  68 *              Pekka Riikonen  :       Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *                                      indefinitely on dev->refcnt
  71 *              J Hadi Salim    :       - Backlog queue sampling
  72 *                                      - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/sched/mm.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/skbuff.h>
  97#include <linux/bpf.h>
  98#include <linux/bpf_trace.h>
  99#include <net/net_namespace.h>
 100#include <net/sock.h>
 101#include <net/busy_poll.h>
 102#include <linux/rtnetlink.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/dst_metadata.h>
 106#include <net/pkt_sched.h>
 107#include <net/pkt_cls.h>
 108#include <net/checksum.h>
 109#include <net/xfrm.h>
 110#include <linux/highmem.h>
 111#include <linux/init.h>
 112#include <linux/module.h>
 113#include <linux/netpoll.h>
 114#include <linux/rcupdate.h>
 115#include <linux/delay.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <net/mpls.h>
 127#include <linux/ipv6.h>
 128#include <linux/in.h>
 129#include <linux/jhash.h>
 130#include <linux/random.h>
 131#include <trace/events/napi.h>
 132#include <trace/events/net.h>
 133#include <trace/events/skb.h>
 134#include <linux/pci.h>
 135#include <linux/inetdevice.h>
 136#include <linux/cpu_rmap.h>
 137#include <linux/static_key.h>
 138#include <linux/hashtable.h>
 139#include <linux/vmalloc.h>
 140#include <linux/if_macvlan.h>
 141#include <linux/errqueue.h>
 142#include <linux/hrtimer.h>
 143#include <linux/netfilter_ingress.h>
 144#include <linux/crash_dump.h>
 145#include <linux/sctp.h>
 146#include <net/udp_tunnel.h>
 147#include <linux/net_namespace.h>
 148
 149#include "net-sysfs.h"
 150
 151#define MAX_GRO_SKBS 8
 152
 153/* This should be increased if a protocol with a bigger head is added. */
 154#define GRO_MAX_HEAD (MAX_HEADER + 128)
 155
 156static DEFINE_SPINLOCK(ptype_lock);
 157static DEFINE_SPINLOCK(offload_lock);
 158struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 159struct list_head ptype_all __read_mostly;       /* Taps */
 160static struct list_head offload_base __read_mostly;
 161
 162static int netif_rx_internal(struct sk_buff *skb);
 163static int call_netdevice_notifiers_info(unsigned long val,
 164                                         struct netdev_notifier_info *info);
 165static struct napi_struct *napi_by_id(unsigned int napi_id);
 166
 167/*
 168 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 169 * semaphore.
 170 *
 171 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 172 *
 173 * Writers must hold the rtnl semaphore while they loop through the
 174 * dev_base_head list, and hold dev_base_lock for writing when they do the
 175 * actual updates.  This allows pure readers to access the list even
 176 * while a writer is preparing to update it.
 177 *
 178 * To put it another way, dev_base_lock is held for writing only to
 179 * protect against pure readers; the rtnl semaphore provides the
 180 * protection against other writers.
 181 *
 182 * See, for example usages, register_netdevice() and
 183 * unregister_netdevice(), which must be called with the rtnl
 184 * semaphore held.
 185 */
 186DEFINE_RWLOCK(dev_base_lock);
 187EXPORT_SYMBOL(dev_base_lock);
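
/*
 * Illustrative sketch (not part of the original file): a pure reader walking
 * the device list under dev_base_lock, as described in the comment above.
 * The function name is an assumption for the example; holding rcu_read_lock()
 * and the _rcu iterators would work equally well.
 */
static void example_dump_devices(struct net *net)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
	read_unlock(&dev_base_lock);
}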
 188
 189static DEFINE_MUTEX(ifalias_mutex);
 190
 191/* protects napi_hash addition/deletion and napi_gen_id */
 192static DEFINE_SPINLOCK(napi_hash_lock);
 193
 194static unsigned int napi_gen_id = NR_CPUS;
 195static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 196
 197static seqcount_t devnet_rename_seq;
 198
 199static inline void dev_base_seq_inc(struct net *net)
 200{
 201        while (++net->dev_base_seq == 0)
 202                ;
 203}
 204
 205static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 206{
 207        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 208
 209        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 210}
 211
 212static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 213{
 214        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 215}
 216
 217static inline void rps_lock(struct softnet_data *sd)
 218{
 219#ifdef CONFIG_RPS
 220        spin_lock(&sd->input_pkt_queue.lock);
 221#endif
 222}
 223
 224static inline void rps_unlock(struct softnet_data *sd)
 225{
 226#ifdef CONFIG_RPS
 227        spin_unlock(&sd->input_pkt_queue.lock);
 228#endif
 229}
 230
 231/* Device list insertion */
 232static void list_netdevice(struct net_device *dev)
 233{
 234        struct net *net = dev_net(dev);
 235
 236        ASSERT_RTNL();
 237
 238        write_lock_bh(&dev_base_lock);
 239        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 240        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 241        hlist_add_head_rcu(&dev->index_hlist,
 242                           dev_index_hash(net, dev->ifindex));
 243        write_unlock_bh(&dev_base_lock);
 244
 245        dev_base_seq_inc(net);
 246}
 247
 248/* Device list removal
  249 * caller must respect an RCU grace period before freeing/reusing dev
 250 */
 251static void unlist_netdevice(struct net_device *dev)
 252{
 253        ASSERT_RTNL();
 254
 255        /* Unlink dev from the device chain */
 256        write_lock_bh(&dev_base_lock);
 257        list_del_rcu(&dev->dev_list);
 258        hlist_del_rcu(&dev->name_hlist);
 259        hlist_del_rcu(&dev->index_hlist);
 260        write_unlock_bh(&dev_base_lock);
 261
 262        dev_base_seq_inc(dev_net(dev));
 263}
 264
 265/*
 266 *      Our notifier list
 267 */
 268
 269static RAW_NOTIFIER_HEAD(netdev_chain);
 270
 271/*
 272 *      Device drivers call our routines to queue packets here. We empty the
 273 *      queue in the local softnet handler.
 274 */
 275
 276DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 277EXPORT_PER_CPU_SYMBOL(softnet_data);
 278
 279#ifdef CONFIG_LOCKDEP
 280/*
 281 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 282 * according to dev->type
 283 */
 284static const unsigned short netdev_lock_type[] = {
 285         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 286         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 287         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 288         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 289         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 290         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 291         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 292         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 293         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 294         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 295         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 296         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 297         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 298         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 299         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 300
 301static const char *const netdev_lock_name[] = {
 302        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 303        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 304        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 305        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 306        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 307        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 308        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 309        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 310        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 311        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 312        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 313        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 314        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 315        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 316        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 317
 318static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 319static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 320
 321static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 322{
 323        int i;
 324
 325        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 326                if (netdev_lock_type[i] == dev_type)
 327                        return i;
 328        /* the last key is used by default */
 329        return ARRAY_SIZE(netdev_lock_type) - 1;
 330}
 331
 332static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 333                                                 unsigned short dev_type)
 334{
 335        int i;
 336
 337        i = netdev_lock_pos(dev_type);
 338        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 339                                   netdev_lock_name[i]);
 340}
 341
 342static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 343{
 344        int i;
 345
 346        i = netdev_lock_pos(dev->type);
 347        lockdep_set_class_and_name(&dev->addr_list_lock,
 348                                   &netdev_addr_lock_key[i],
 349                                   netdev_lock_name[i]);
 350}
 351#else
 352static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 353                                                 unsigned short dev_type)
 354{
 355}
 356static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 357{
 358}
 359#endif
 360
 361/*******************************************************************************
 362 *
 363 *              Protocol management and registration routines
 364 *
 365 *******************************************************************************/
 366
 367
 368/*
 369 *      Add a protocol ID to the list. Now that the input handler is
 370 *      smarter we can dispense with all the messy stuff that used to be
 371 *      here.
 372 *
 373 *      BEWARE!!! Protocol handlers, mangling input packets,
 374 *      MUST BE last in hash buckets and checking protocol handlers
 375 *      MUST start from promiscuous ptype_all chain in net_bh.
 376 *      It is true now, do not change it.
  377 *      Explanation follows: if a protocol handler that mangles the packet
  378 *      were first on the list, it could not sense that the packet
  379 *      is cloned and should be copied-on-write, so it would
  380 *      change it and subsequent readers would get a broken packet.
 381 *                                                      --ANK (980803)
 382 */
 383
 384static inline struct list_head *ptype_head(const struct packet_type *pt)
 385{
 386        if (pt->type == htons(ETH_P_ALL))
 387                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 388        else
 389                return pt->dev ? &pt->dev->ptype_specific :
 390                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 391}
 392
 393/**
 394 *      dev_add_pack - add packet handler
 395 *      @pt: packet type declaration
 396 *
 397 *      Add a protocol handler to the networking stack. The passed &packet_type
 398 *      is linked into kernel lists and may not be freed until it has been
 399 *      removed from the kernel lists.
 400 *
  401 *      This call does not sleep, therefore it cannot
  402 *      guarantee that all CPUs that are in the middle of receiving packets
  403 *      will see the new packet type (until the next received packet).
 404 */
 405
 406void dev_add_pack(struct packet_type *pt)
 407{
 408        struct list_head *head = ptype_head(pt);
 409
 410        spin_lock(&ptype_lock);
 411        list_add_rcu(&pt->list, head);
 412        spin_unlock(&ptype_lock);
 413}
 414EXPORT_SYMBOL(dev_add_pack);
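
/*
 * Illustrative sketch (not part of the original file): a hypothetical module
 * registering a tap for every protocol with dev_add_pack().  The handler and
 * structure names are assumptions for the example only.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The tap owns a reference on the skb and must release it; the data
	 * may be shared with other receivers, so it must not be modified here.
	 */
	consume_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* deliver all protocols to the tap */
	.func = example_tap_rcv,
};

/* A module would call dev_add_pack(&example_tap) from its init path and
 * dev_remove_pack(&example_tap) on exit.
 */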
 415
 416/**
 417 *      __dev_remove_pack        - remove packet handler
 418 *      @pt: packet type declaration
 419 *
 420 *      Remove a protocol handler that was previously added to the kernel
 421 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 422 *      from the kernel lists and can be freed or reused once this function
 423 *      returns.
 424 *
 425 *      The packet type might still be in use by receivers
  426 *      and must not be freed until after all the CPUs have gone
 427 *      through a quiescent state.
 428 */
 429void __dev_remove_pack(struct packet_type *pt)
 430{
 431        struct list_head *head = ptype_head(pt);
 432        struct packet_type *pt1;
 433
 434        spin_lock(&ptype_lock);
 435
 436        list_for_each_entry(pt1, head, list) {
 437                if (pt == pt1) {
 438                        list_del_rcu(&pt->list);
 439                        goto out;
 440                }
 441        }
 442
 443        pr_warn("dev_remove_pack: %p not found\n", pt);
 444out:
 445        spin_unlock(&ptype_lock);
 446}
 447EXPORT_SYMBOL(__dev_remove_pack);
 448
 449/**
 450 *      dev_remove_pack  - remove packet handler
 451 *      @pt: packet type declaration
 452 *
 453 *      Remove a protocol handler that was previously added to the kernel
 454 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 455 *      from the kernel lists and can be freed or reused once this function
 456 *      returns.
 457 *
 458 *      This call sleeps to guarantee that no CPU is looking at the packet
 459 *      type after return.
 460 */
 461void dev_remove_pack(struct packet_type *pt)
 462{
 463        __dev_remove_pack(pt);
 464
 465        synchronize_net();
 466}
 467EXPORT_SYMBOL(dev_remove_pack);
 468
 469
 470/**
 471 *      dev_add_offload - register offload handlers
 472 *      @po: protocol offload declaration
 473 *
 474 *      Add protocol offload handlers to the networking stack. The passed
 475 *      &proto_offload is linked into kernel lists and may not be freed until
 476 *      it has been removed from the kernel lists.
 477 *
  478 *      This call does not sleep, therefore it cannot
  479 *      guarantee that all CPUs that are in the middle of receiving packets
  480 *      will see the new offload handlers (until the next received packet).
 481 */
 482void dev_add_offload(struct packet_offload *po)
 483{
 484        struct packet_offload *elem;
 485
 486        spin_lock(&offload_lock);
 487        list_for_each_entry(elem, &offload_base, list) {
 488                if (po->priority < elem->priority)
 489                        break;
 490        }
 491        list_add_rcu(&po->list, elem->list.prev);
 492        spin_unlock(&offload_lock);
 493}
 494EXPORT_SYMBOL(dev_add_offload);
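
/*
 * Illustrative sketch (not part of the original file): a hypothetical protocol
 * registering a GRO completion callback with dev_add_offload().  The names and
 * the chosen priority are assumptions for the example only.
 */
static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
	/* A real handler would fix up the headers of the merged super-packet. */
	return 0;
}

static struct packet_offload example_offload __read_mostly = {
	.type     = cpu_to_be16(ETH_P_IP),
	.priority = 10,		/* list above is kept in ascending priority order */
	.callbacks = {
		.gro_complete = example_gro_complete,
	},
};

/* dev_add_offload(&example_offload) at init, dev_remove_offload() on exit. */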
 495
 496/**
 497 *      __dev_remove_offload     - remove offload handler
 498 *      @po: packet offload declaration
 499 *
 500 *      Remove a protocol offload handler that was previously added to the
 501 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 502 *      is removed from the kernel lists and can be freed or reused once this
 503 *      function returns.
 504 *
 505 *      The packet type might still be in use by receivers
  506 *      and must not be freed until after all the CPUs have gone
 507 *      through a quiescent state.
 508 */
 509static void __dev_remove_offload(struct packet_offload *po)
 510{
 511        struct list_head *head = &offload_base;
 512        struct packet_offload *po1;
 513
 514        spin_lock(&offload_lock);
 515
 516        list_for_each_entry(po1, head, list) {
 517                if (po == po1) {
 518                        list_del_rcu(&po->list);
 519                        goto out;
 520                }
 521        }
 522
 523        pr_warn("dev_remove_offload: %p not found\n", po);
 524out:
 525        spin_unlock(&offload_lock);
 526}
 527
 528/**
 529 *      dev_remove_offload       - remove packet offload handler
 530 *      @po: packet offload declaration
 531 *
 532 *      Remove a packet offload handler that was previously added to the kernel
 533 *      offload handlers by dev_add_offload(). The passed &offload_type is
 534 *      removed from the kernel lists and can be freed or reused once this
 535 *      function returns.
 536 *
 537 *      This call sleeps to guarantee that no CPU is looking at the packet
 538 *      type after return.
 539 */
 540void dev_remove_offload(struct packet_offload *po)
 541{
 542        __dev_remove_offload(po);
 543
 544        synchronize_net();
 545}
 546EXPORT_SYMBOL(dev_remove_offload);
 547
 548/******************************************************************************
 549 *
 550 *                    Device Boot-time Settings Routines
 551 *
 552 ******************************************************************************/
 553
 554/* Boot time configuration table */
 555static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 556
 557/**
 558 *      netdev_boot_setup_add   - add new setup entry
 559 *      @name: name of the device
 560 *      @map: configured settings for the device
 561 *
  562 *      Adds a new setup entry to the dev_boot_setup list.  The function
  563 *      returns 0 on error and 1 on success.  This is a generic routine for
  564 *      all netdevices.
 565 */
 566static int netdev_boot_setup_add(char *name, struct ifmap *map)
 567{
 568        struct netdev_boot_setup *s;
 569        int i;
 570
 571        s = dev_boot_setup;
 572        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 573                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 574                        memset(s[i].name, 0, sizeof(s[i].name));
 575                        strlcpy(s[i].name, name, IFNAMSIZ);
 576                        memcpy(&s[i].map, map, sizeof(s[i].map));
 577                        break;
 578                }
 579        }
 580
 581        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 582}
 583
 584/**
 585 * netdev_boot_setup_check      - check boot time settings
 586 * @dev: the netdevice
 587 *
 588 * Check boot time settings for the device.
 589 * The found settings are set for the device to be used
 590 * later in the device probing.
  591 * Returns 0 if no settings are found, 1 if they are.
 592 */
 593int netdev_boot_setup_check(struct net_device *dev)
 594{
 595        struct netdev_boot_setup *s = dev_boot_setup;
 596        int i;
 597
 598        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 599                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 600                    !strcmp(dev->name, s[i].name)) {
 601                        dev->irq = s[i].map.irq;
 602                        dev->base_addr = s[i].map.base_addr;
 603                        dev->mem_start = s[i].map.mem_start;
 604                        dev->mem_end = s[i].map.mem_end;
 605                        return 1;
 606                }
 607        }
 608        return 0;
 609}
 610EXPORT_SYMBOL(netdev_boot_setup_check);
 611
 612
 613/**
 614 * netdev_boot_base     - get address from boot time settings
 615 * @prefix: prefix for network device
 616 * @unit: id for network device
 617 *
  618 * Check boot time settings for the base address of the device.
 619 * The found settings are set for the device to be used
 620 * later in the device probing.
 621 * Returns 0 if no settings found.
 622 */
 623unsigned long netdev_boot_base(const char *prefix, int unit)
 624{
 625        const struct netdev_boot_setup *s = dev_boot_setup;
 626        char name[IFNAMSIZ];
 627        int i;
 628
 629        sprintf(name, "%s%d", prefix, unit);
 630
 631        /*
 632         * If device already registered then return base of 1
 633         * to indicate not to probe for this interface
 634         */
 635        if (__dev_get_by_name(&init_net, name))
 636                return 1;
 637
 638        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 639                if (!strcmp(name, s[i].name))
 640                        return s[i].map.base_addr;
 641        return 0;
 642}
 643
 644/*
 645 * Saves at boot time configured settings for any netdevice.
 646 */
 647int __init netdev_boot_setup(char *str)
 648{
 649        int ints[5];
 650        struct ifmap map;
 651
 652        str = get_options(str, ARRAY_SIZE(ints), ints);
 653        if (!str || !*str)
 654                return 0;
 655
 656        /* Save settings */
 657        memset(&map, 0, sizeof(map));
 658        if (ints[0] > 0)
 659                map.irq = ints[1];
 660        if (ints[0] > 1)
 661                map.base_addr = ints[2];
 662        if (ints[0] > 2)
 663                map.mem_start = ints[3];
 664        if (ints[0] > 3)
 665                map.mem_end = ints[4];
 666
 667        /* Add new entry to the list */
 668        return netdev_boot_setup_add(str, &map);
 669}
 670
 671__setup("netdev=", netdev_boot_setup);
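
/*
 * Illustrative example (assumption, not from the original file): with the
 * parser above, a kernel command line entry such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records irq 9 and I/O base 0x300 under the name "eth0", which
 * netdev_boot_setup_check() later copies into the device when a driver
 * probes an interface of that name.
 */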
 672
 673/*******************************************************************************
 674 *
 675 *                          Device Interface Subroutines
 676 *
 677 *******************************************************************************/
 678
 679/**
  680 *      dev_get_iflink  - get 'iflink' value of an interface
 681 *      @dev: targeted interface
 682 *
 683 *      Indicates the ifindex the interface is linked to.
 684 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 685 */
 686
 687int dev_get_iflink(const struct net_device *dev)
 688{
 689        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 690                return dev->netdev_ops->ndo_get_iflink(dev);
 691
 692        return dev->ifindex;
 693}
 694EXPORT_SYMBOL(dev_get_iflink);
 695
 696/**
 697 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 698 *      @dev: targeted interface
 699 *      @skb: The packet.
 700 *
  701 *      For better visibility of tunnel traffic, OVS needs to retrieve
  702 *      egress tunnel information for a packet. The following API allows
  703 *      the user to get this info.
 704 */
 705int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 706{
 707        struct ip_tunnel_info *info;
 708
 709        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 710                return -EINVAL;
 711
 712        info = skb_tunnel_info_unclone(skb);
 713        if (!info)
 714                return -ENOMEM;
 715        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 716                return -EINVAL;
 717
 718        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 719}
 720EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 721
 722/**
 723 *      __dev_get_by_name       - find a device by its name
 724 *      @net: the applicable net namespace
 725 *      @name: name to find
 726 *
 727 *      Find an interface by name. Must be called under RTNL semaphore
 728 *      or @dev_base_lock. If the name is found a pointer to the device
 729 *      is returned. If the name is not found then %NULL is returned. The
 730 *      reference counters are not incremented so the caller must be
 731 *      careful with locks.
 732 */
 733
 734struct net_device *__dev_get_by_name(struct net *net, const char *name)
 735{
 736        struct net_device *dev;
 737        struct hlist_head *head = dev_name_hash(net, name);
 738
 739        hlist_for_each_entry(dev, head, name_hlist)
 740                if (!strncmp(dev->name, name, IFNAMSIZ))
 741                        return dev;
 742
 743        return NULL;
 744}
 745EXPORT_SYMBOL(__dev_get_by_name);
 746
 747/**
 748 * dev_get_by_name_rcu  - find a device by its name
 749 * @net: the applicable net namespace
 750 * @name: name to find
 751 *
 752 * Find an interface by name.
 753 * If the name is found a pointer to the device is returned.
 754 * If the name is not found then %NULL is returned.
 755 * The reference counters are not incremented so the caller must be
 756 * careful with locks. The caller must hold RCU lock.
 757 */
 758
 759struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 760{
 761        struct net_device *dev;
 762        struct hlist_head *head = dev_name_hash(net, name);
 763
 764        hlist_for_each_entry_rcu(dev, head, name_hlist)
 765                if (!strncmp(dev->name, name, IFNAMSIZ))
 766                        return dev;
 767
 768        return NULL;
 769}
 770EXPORT_SYMBOL(dev_get_by_name_rcu);
 771
 772/**
 773 *      dev_get_by_name         - find a device by its name
 774 *      @net: the applicable net namespace
 775 *      @name: name to find
 776 *
 777 *      Find an interface by name. This can be called from any
 778 *      context and does its own locking. The returned handle has
 779 *      the usage count incremented and the caller must use dev_put() to
 780 *      release it when it is no longer needed. %NULL is returned if no
 781 *      matching device is found.
 782 */
 783
 784struct net_device *dev_get_by_name(struct net *net, const char *name)
 785{
 786        struct net_device *dev;
 787
 788        rcu_read_lock();
 789        dev = dev_get_by_name_rcu(net, name);
 790        if (dev)
 791                dev_hold(dev);
 792        rcu_read_unlock();
 793        return dev;
 794}
 795EXPORT_SYMBOL(dev_get_by_name);
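
/*
 * Illustrative sketch (not part of the original file): looking a device up by
 * name from process context and releasing the reference when done.  The
 * function name is an assumption for the example only.
 */
static int example_report_ifindex(struct net *net, const char *ifname)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, ifname);	/* takes its own reference */
	if (!dev)
		return -ENODEV;

	netdev_info(dev, "has ifindex %d\n", dev->ifindex);

	dev_put(dev);				/* drop the reference */
	return 0;
}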
 796
 797/**
 798 *      __dev_get_by_index - find a device by its ifindex
 799 *      @net: the applicable net namespace
 800 *      @ifindex: index of device
 801 *
 802 *      Search for an interface by index. Returns %NULL if the device
 803 *      is not found or a pointer to the device. The device has not
 804 *      had its reference counter increased so the caller must be careful
 805 *      about locking. The caller must hold either the RTNL semaphore
 806 *      or @dev_base_lock.
 807 */
 808
 809struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 810{
 811        struct net_device *dev;
 812        struct hlist_head *head = dev_index_hash(net, ifindex);
 813
 814        hlist_for_each_entry(dev, head, index_hlist)
 815                if (dev->ifindex == ifindex)
 816                        return dev;
 817
 818        return NULL;
 819}
 820EXPORT_SYMBOL(__dev_get_by_index);
 821
 822/**
 823 *      dev_get_by_index_rcu - find a device by its ifindex
 824 *      @net: the applicable net namespace
 825 *      @ifindex: index of device
 826 *
 827 *      Search for an interface by index. Returns %NULL if the device
 828 *      is not found or a pointer to the device. The device has not
 829 *      had its reference counter increased so the caller must be careful
 830 *      about locking. The caller must hold RCU lock.
 831 */
 832
 833struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 834{
 835        struct net_device *dev;
 836        struct hlist_head *head = dev_index_hash(net, ifindex);
 837
 838        hlist_for_each_entry_rcu(dev, head, index_hlist)
 839                if (dev->ifindex == ifindex)
 840                        return dev;
 841
 842        return NULL;
 843}
 844EXPORT_SYMBOL(dev_get_by_index_rcu);
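
/*
 * Illustrative sketch (not part of the original file): an RCU-protected lookup
 * by ifindex when no long-lived reference is needed.  The function name is an
 * assumption for the example only.
 */
static unsigned int example_mtu_of(struct net *net, int ifindex)
{
	struct net_device *dev;
	unsigned int mtu = 0;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		mtu = dev->mtu;	/* only safe to use inside the RCU section */
	rcu_read_unlock();

	return mtu;
}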
 845
 846
 847/**
 848 *      dev_get_by_index - find a device by its ifindex
 849 *      @net: the applicable net namespace
 850 *      @ifindex: index of device
 851 *
 852 *      Search for an interface by index. Returns NULL if the device
 853 *      is not found or a pointer to the device. The device returned has
 854 *      had a reference added and the pointer is safe until the user calls
 855 *      dev_put to indicate they have finished with it.
 856 */
 857
 858struct net_device *dev_get_by_index(struct net *net, int ifindex)
 859{
 860        struct net_device *dev;
 861
 862        rcu_read_lock();
 863        dev = dev_get_by_index_rcu(net, ifindex);
 864        if (dev)
 865                dev_hold(dev);
 866        rcu_read_unlock();
 867        return dev;
 868}
 869EXPORT_SYMBOL(dev_get_by_index);
 870
 871/**
 872 *      dev_get_by_napi_id - find a device by napi_id
 873 *      @napi_id: ID of the NAPI struct
 874 *
 875 *      Search for an interface by NAPI ID. Returns %NULL if the device
 876 *      is not found or a pointer to the device. The device has not had
 877 *      its reference counter increased so the caller must be careful
 878 *      about locking. The caller must hold RCU lock.
 879 */
 880
 881struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 882{
 883        struct napi_struct *napi;
 884
 885        WARN_ON_ONCE(!rcu_read_lock_held());
 886
 887        if (napi_id < MIN_NAPI_ID)
 888                return NULL;
 889
 890        napi = napi_by_id(napi_id);
 891
 892        return napi ? napi->dev : NULL;
 893}
 894EXPORT_SYMBOL(dev_get_by_napi_id);
 895
 896/**
 897 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 898 *      @net: network namespace
 899 *      @name: a pointer to the buffer where the name will be stored.
 900 *      @ifindex: the ifindex of the interface to get the name from.
 901 *
 902 *      The use of raw_seqcount_begin() and cond_resched() before
 903 *      retrying is required as we want to give the writers a chance
 904 *      to complete when CONFIG_PREEMPT is not set.
 905 */
 906int netdev_get_name(struct net *net, char *name, int ifindex)
 907{
 908        struct net_device *dev;
 909        unsigned int seq;
 910
 911retry:
 912        seq = raw_seqcount_begin(&devnet_rename_seq);
 913        rcu_read_lock();
 914        dev = dev_get_by_index_rcu(net, ifindex);
 915        if (!dev) {
 916                rcu_read_unlock();
 917                return -ENODEV;
 918        }
 919
 920        strcpy(name, dev->name);
 921        rcu_read_unlock();
 922        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 923                cond_resched();
 924                goto retry;
 925        }
 926
 927        return 0;
 928}
 929
 930/**
 931 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 932 *      @net: the applicable net namespace
 933 *      @type: media type of device
 934 *      @ha: hardware address
 935 *
 936 *      Search for an interface by MAC address. Returns NULL if the device
 937 *      is not found or a pointer to the device.
 938 *      The caller must hold RCU or RTNL.
 939 *      The returned device has not had its ref count increased
  940 *      and the caller must therefore be careful about locking.
 941 *
 942 */
 943
 944struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 945                                       const char *ha)
 946{
 947        struct net_device *dev;
 948
 949        for_each_netdev_rcu(net, dev)
 950                if (dev->type == type &&
 951                    !memcmp(dev->dev_addr, ha, dev->addr_len))
 952                        return dev;
 953
 954        return NULL;
 955}
 956EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 957
 958struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 959{
 960        struct net_device *dev;
 961
 962        ASSERT_RTNL();
 963        for_each_netdev(net, dev)
 964                if (dev->type == type)
 965                        return dev;
 966
 967        return NULL;
 968}
 969EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 970
 971struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 972{
 973        struct net_device *dev, *ret = NULL;
 974
 975        rcu_read_lock();
 976        for_each_netdev_rcu(net, dev)
 977                if (dev->type == type) {
 978                        dev_hold(dev);
 979                        ret = dev;
 980                        break;
 981                }
 982        rcu_read_unlock();
 983        return ret;
 984}
 985EXPORT_SYMBOL(dev_getfirstbyhwtype);
 986
 987/**
 988 *      __dev_get_by_flags - find any device with given flags
 989 *      @net: the applicable net namespace
 990 *      @if_flags: IFF_* values
 991 *      @mask: bitmask of bits in if_flags to check
 992 *
 993 *      Search for any interface with the given flags. Returns NULL if a device
 994 *      is not found or a pointer to the device. Must be called inside
 995 *      rtnl_lock(), and result refcount is unchanged.
 996 */
 997
 998struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 999                                      unsigned short mask)
1000{
1001        struct net_device *dev, *ret;
1002
1003        ASSERT_RTNL();
1004
1005        ret = NULL;
1006        for_each_netdev(net, dev) {
1007                if (((dev->flags ^ if_flags) & mask) == 0) {
1008                        ret = dev;
1009                        break;
1010                }
1011        }
1012        return ret;
1013}
1014EXPORT_SYMBOL(__dev_get_by_flags);
1015
1016/**
1017 *      dev_valid_name - check if name is okay for network device
1018 *      @name: name string
1019 *
 1020 *      Network device names need to be valid file names
 1021 *      to allow sysfs to work.  We also disallow any kind of
1022 *      whitespace.
1023 */
1024bool dev_valid_name(const char *name)
1025{
1026        if (*name == '\0')
1027                return false;
1028        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1029                return false;
1030        if (!strcmp(name, ".") || !strcmp(name, ".."))
1031                return false;
1032
1033        while (*name) {
1034                if (*name == '/' || *name == ':' || isspace(*name))
1035                        return false;
1036                name++;
1037        }
1038        return true;
1039}
1040EXPORT_SYMBOL(dev_valid_name);
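
/*
 * Illustrative examples (not part of the original file): "eth0", "wan%d" and
 * "br-lan" are accepted by dev_valid_name(), while "", ".", "..", "a/b",
 * "a:b", names containing whitespace, and names of IFNAMSIZ characters or
 * more are rejected.
 */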
1041
1042/**
1043 *      __dev_alloc_name - allocate a name for a device
1044 *      @net: network namespace to allocate the device name in
1045 *      @name: name format string
1046 *      @buf:  scratch buffer and result name string
1047 *
 1048 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1049 *      id. It scans the list of devices to build up a free map, then chooses
1050 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1051 *      while allocating the name and adding the device in order to avoid
1052 *      duplicates.
1053 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1054 *      Returns the number of the unit assigned or a negative errno code.
1055 */
1056
1057static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1058{
1059        int i = 0;
1060        const char *p;
1061        const int max_netdevices = 8*PAGE_SIZE;
1062        unsigned long *inuse;
1063        struct net_device *d;
1064
1065        if (!dev_valid_name(name))
1066                return -EINVAL;
1067
1068        p = strchr(name, '%');
1069        if (p) {
1070                /*
1071                 * Verify the string as this thing may have come from
1072                 * the user.  There must be either one "%d" and no other "%"
1073                 * characters.
1074                 */
1075                if (p[1] != 'd' || strchr(p + 2, '%'))
1076                        return -EINVAL;
1077
1078                /* Use one page as a bit array of possible slots */
1079                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1080                if (!inuse)
1081                        return -ENOMEM;
1082
1083                for_each_netdev(net, d) {
1084                        if (!sscanf(d->name, name, &i))
1085                                continue;
1086                        if (i < 0 || i >= max_netdevices)
1087                                continue;
1088
1089                        /*  avoid cases where sscanf is not exact inverse of printf */
1090                        snprintf(buf, IFNAMSIZ, name, i);
1091                        if (!strncmp(buf, d->name, IFNAMSIZ))
1092                                set_bit(i, inuse);
1093                }
1094
1095                i = find_first_zero_bit(inuse, max_netdevices);
1096                free_page((unsigned long) inuse);
1097        }
1098
1099        snprintf(buf, IFNAMSIZ, name, i);
1100        if (!__dev_get_by_name(net, buf))
1101                return i;
1102
1103        /* It is possible to run out of possible slots
1104         * when the name is long and there isn't enough space left
1105         * for the digits, or if all bits are used.
1106         */
1107        return -ENFILE;
1108}
1109
1110static int dev_alloc_name_ns(struct net *net,
1111                             struct net_device *dev,
1112                             const char *name)
1113{
1114        char buf[IFNAMSIZ];
1115        int ret;
1116
1117        BUG_ON(!net);
1118        ret = __dev_alloc_name(net, name, buf);
1119        if (ret >= 0)
1120                strlcpy(dev->name, buf, IFNAMSIZ);
1121        return ret;
1122}
1123
1124/**
1125 *      dev_alloc_name - allocate a name for a device
1126 *      @dev: device
1127 *      @name: name format string
1128 *
 1129 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1130 *      id. It scans the list of devices to build up a free map, then chooses
1131 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1132 *      while allocating the name and adding the device in order to avoid
1133 *      duplicates.
1134 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1135 *      Returns the number of the unit assigned or a negative errno code.
1136 */
1137
1138int dev_alloc_name(struct net_device *dev, const char *name)
1139{
1140        return dev_alloc_name_ns(dev_net(dev), dev, name);
1141}
1142EXPORT_SYMBOL(dev_alloc_name);
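
/*
 * Illustrative sketch (not part of the original file): a driver asking for the
 * next free "eth%d" slot before registering the device.  The function name is
 * an assumption for the example only; per the comment above, rtnl should be
 * held so the chosen name stays unique until register_netdevice().
 */
static int example_pick_name(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "eth%d");	/* writes e.g. "eth3" into dev->name */
	if (unit < 0)
		return unit;			/* -EINVAL or -ENFILE */

	return 0;
}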
1143
1144int dev_get_valid_name(struct net *net, struct net_device *dev,
1145                       const char *name)
1146{
1147        BUG_ON(!net);
1148
1149        if (!dev_valid_name(name))
1150                return -EINVAL;
1151
1152        if (strchr(name, '%'))
1153                return dev_alloc_name_ns(net, dev, name);
1154        else if (__dev_get_by_name(net, name))
1155                return -EEXIST;
1156        else if (dev->name != name)
1157                strlcpy(dev->name, name, IFNAMSIZ);
1158
1159        return 0;
1160}
1161EXPORT_SYMBOL(dev_get_valid_name);
1162
1163/**
1164 *      dev_change_name - change name of a device
1165 *      @dev: device
1166 *      @newname: name (or format string) must be at least IFNAMSIZ
1167 *
 1168 *      Change the name of a device. A format string such as "eth%d" can
 1169 *      be passed for wildcarding.
1170 */
1171int dev_change_name(struct net_device *dev, const char *newname)
1172{
1173        unsigned char old_assign_type;
1174        char oldname[IFNAMSIZ];
1175        int err = 0;
1176        int ret;
1177        struct net *net;
1178
1179        ASSERT_RTNL();
1180        BUG_ON(!dev_net(dev));
1181
1182        net = dev_net(dev);
1183        if (dev->flags & IFF_UP)
1184                return -EBUSY;
1185
1186        write_seqcount_begin(&devnet_rename_seq);
1187
1188        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1189                write_seqcount_end(&devnet_rename_seq);
1190                return 0;
1191        }
1192
1193        memcpy(oldname, dev->name, IFNAMSIZ);
1194
1195        err = dev_get_valid_name(net, dev, newname);
1196        if (err < 0) {
1197                write_seqcount_end(&devnet_rename_seq);
1198                return err;
1199        }
1200
1201        if (oldname[0] && !strchr(oldname, '%'))
1202                netdev_info(dev, "renamed from %s\n", oldname);
1203
1204        old_assign_type = dev->name_assign_type;
1205        dev->name_assign_type = NET_NAME_RENAMED;
1206
1207rollback:
1208        ret = device_rename(&dev->dev, dev->name);
1209        if (ret) {
1210                memcpy(dev->name, oldname, IFNAMSIZ);
1211                dev->name_assign_type = old_assign_type;
1212                write_seqcount_end(&devnet_rename_seq);
1213                return ret;
1214        }
1215
1216        write_seqcount_end(&devnet_rename_seq);
1217
1218        netdev_adjacent_rename_links(dev, oldname);
1219
1220        write_lock_bh(&dev_base_lock);
1221        hlist_del_rcu(&dev->name_hlist);
1222        write_unlock_bh(&dev_base_lock);
1223
1224        synchronize_rcu();
1225
1226        write_lock_bh(&dev_base_lock);
1227        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1228        write_unlock_bh(&dev_base_lock);
1229
1230        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1231        ret = notifier_to_errno(ret);
1232
1233        if (ret) {
1234                /* err >= 0 after dev_alloc_name() or stores the first errno */
1235                if (err >= 0) {
1236                        err = ret;
1237                        write_seqcount_begin(&devnet_rename_seq);
1238                        memcpy(dev->name, oldname, IFNAMSIZ);
1239                        memcpy(oldname, newname, IFNAMSIZ);
1240                        dev->name_assign_type = old_assign_type;
1241                        old_assign_type = NET_NAME_RENAMED;
1242                        goto rollback;
1243                } else {
1244                        pr_err("%s: name change rollback failed: %d\n",
1245                               dev->name, ret);
1246                }
1247        }
1248
1249        return err;
1250}
1251
1252/**
1253 *      dev_set_alias - change ifalias of a device
1254 *      @dev: device
1255 *      @alias: name up to IFALIASZ
 1256 *      @len: limit of bytes to copy from @alias
 1257 *
 1258 *      Set the ifalias for a device.
1259 */
1260int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1261{
1262        struct dev_ifalias *new_alias = NULL;
1263
1264        if (len >= IFALIASZ)
1265                return -EINVAL;
1266
1267        if (len) {
1268                new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1269                if (!new_alias)
1270                        return -ENOMEM;
1271
1272                memcpy(new_alias->ifalias, alias, len);
1273                new_alias->ifalias[len] = 0;
1274        }
1275
1276        mutex_lock(&ifalias_mutex);
1277        rcu_swap_protected(dev->ifalias, new_alias,
1278                           mutex_is_locked(&ifalias_mutex));
1279        mutex_unlock(&ifalias_mutex);
1280
1281        if (new_alias)
1282                kfree_rcu(new_alias, rcuhead);
1283
1284        return len;
1285}
1286EXPORT_SYMBOL(dev_set_alias);
1287
1288/**
1289 *      dev_get_alias - get ifalias of a device
1290 *      @dev: device
1291 *      @name: buffer to store name of ifalias
1292 *      @len: size of buffer
1293 *
 1294 *      Get the ifalias for a device.  The caller must make sure dev cannot go
 1295 *      away, e.g. by holding an RCU read lock or a reference count on the device.
1296 */
1297int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1298{
1299        const struct dev_ifalias *alias;
1300        int ret = 0;
1301
1302        rcu_read_lock();
1303        alias = rcu_dereference(dev->ifalias);
1304        if (alias)
1305                ret = snprintf(name, len, "%s", alias->ifalias);
1306        rcu_read_unlock();
1307
1308        return ret;
1309}
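
/*
 * Illustrative sketch (not part of the original file): setting an ifalias and
 * reading it back.  The function name and alias text are assumptions for the
 * example only.
 */
static void example_label_device(struct net_device *dev)
{
	static const char desc[] = "uplink to core switch";
	char buf[IFALIASZ];

	if (dev_set_alias(dev, desc, strlen(desc)) < 0)
		return;

	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
		netdev_info(dev, "alias set to \"%s\"\n", buf);
}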
1310
1311/**
1312 *      netdev_features_change - device changes features
1313 *      @dev: device to cause notification
1314 *
1315 *      Called to indicate a device has changed features.
1316 */
1317void netdev_features_change(struct net_device *dev)
1318{
1319        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1320}
1321EXPORT_SYMBOL(netdev_features_change);
1322
1323/**
1324 *      netdev_state_change - device changes state
1325 *      @dev: device to cause notification
1326 *
1327 *      Called to indicate a device has changed state. This function calls
1328 *      the notifier chains for netdev_chain and sends a NEWLINK message
1329 *      to the routing socket.
1330 */
1331void netdev_state_change(struct net_device *dev)
1332{
1333        if (dev->flags & IFF_UP) {
1334                struct netdev_notifier_change_info change_info = {
1335                        .info.dev = dev,
1336                };
1337
1338                call_netdevice_notifiers_info(NETDEV_CHANGE,
1339                                              &change_info.info);
1340                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1341        }
1342}
1343EXPORT_SYMBOL(netdev_state_change);
1344
1345/**
1346 * netdev_notify_peers - notify network peers about existence of @dev
1347 * @dev: network device
1348 *
1349 * Generate traffic such that interested network peers are aware of
1350 * @dev, such as by generating a gratuitous ARP. This may be used when
1351 * a device wants to inform the rest of the network about some sort of
1352 * reconfiguration such as a failover event or virtual machine
1353 * migration.
1354 */
1355void netdev_notify_peers(struct net_device *dev)
1356{
1357        rtnl_lock();
1358        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1359        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1360        rtnl_unlock();
1361}
1362EXPORT_SYMBOL(netdev_notify_peers);
1363
1364static int __dev_open(struct net_device *dev)
1365{
1366        const struct net_device_ops *ops = dev->netdev_ops;
1367        int ret;
1368
1369        ASSERT_RTNL();
1370
1371        if (!netif_device_present(dev))
1372                return -ENODEV;
1373
1374        /* Block netpoll from trying to do any rx path servicing.
1375         * If we don't do this there is a chance ndo_poll_controller
1376         * or ndo_poll may be running while we open the device
1377         */
1378        netpoll_poll_disable(dev);
1379
1380        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1381        ret = notifier_to_errno(ret);
1382        if (ret)
1383                return ret;
1384
1385        set_bit(__LINK_STATE_START, &dev->state);
1386
1387        if (ops->ndo_validate_addr)
1388                ret = ops->ndo_validate_addr(dev);
1389
1390        if (!ret && ops->ndo_open)
1391                ret = ops->ndo_open(dev);
1392
1393        netpoll_poll_enable(dev);
1394
1395        if (ret)
1396                clear_bit(__LINK_STATE_START, &dev->state);
1397        else {
1398                dev->flags |= IFF_UP;
1399                dev_set_rx_mode(dev);
1400                dev_activate(dev);
1401                add_device_randomness(dev->dev_addr, dev->addr_len);
1402        }
1403
1404        return ret;
1405}
1406
1407/**
1408 *      dev_open        - prepare an interface for use.
1409 *      @dev:   device to open
1410 *
1411 *      Takes a device from down to up state. The device's private open
1412 *      function is invoked and then the multicast lists are loaded. Finally
1413 *      the device is moved into the up state and a %NETDEV_UP message is
1414 *      sent to the netdev notifier chain.
1415 *
1416 *      Calling this function on an active interface is a nop. On a failure
1417 *      a negative errno code is returned.
1418 */
1419int dev_open(struct net_device *dev)
1420{
1421        int ret;
1422
1423        if (dev->flags & IFF_UP)
1424                return 0;
1425
1426        ret = __dev_open(dev);
1427        if (ret < 0)
1428                return ret;
1429
1430        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1431        call_netdevice_notifiers(NETDEV_UP, dev);
1432
1433        return ret;
1434}
1435EXPORT_SYMBOL(dev_open);
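
/*
 * Illustrative sketch (not part of the original file): bringing an interface
 * up from kernel code.  dev_open() must run under the rtnl lock; the function
 * name is an assumption for the example only.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);	/* no-op, returning 0, if the device is already up */
	rtnl_unlock();

	return err;
}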
1436
1437static void __dev_close_many(struct list_head *head)
1438{
1439        struct net_device *dev;
1440
1441        ASSERT_RTNL();
1442        might_sleep();
1443
1444        list_for_each_entry(dev, head, close_list) {
1445                /* Temporarily disable netpoll until the interface is down */
1446                netpoll_poll_disable(dev);
1447
1448                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1449
1450                clear_bit(__LINK_STATE_START, &dev->state);
1451
1452                /* Synchronize to scheduled poll. We cannot touch poll list, it
1453                 * can be even on different cpu. So just clear netif_running().
1454                 *
 1455                 * dev->stop() will invoke napi_disable() on all of its
1456                 * napi_struct instances on this device.
1457                 */
1458                smp_mb__after_atomic(); /* Commit netif_running(). */
1459        }
1460
1461        dev_deactivate_many(head);
1462
1463        list_for_each_entry(dev, head, close_list) {
1464                const struct net_device_ops *ops = dev->netdev_ops;
1465
1466                /*
1467                 *      Call the device specific close. This cannot fail.
1468                 *      Only if device is UP
1469                 *
1470                 *      We allow it to be called even after a DETACH hot-plug
1471                 *      event.
1472                 */
1473                if (ops->ndo_stop)
1474                        ops->ndo_stop(dev);
1475
1476                dev->flags &= ~IFF_UP;
1477                netpoll_poll_enable(dev);
1478        }
1479}
1480
1481static void __dev_close(struct net_device *dev)
1482{
1483        LIST_HEAD(single);
1484
1485        list_add(&dev->close_list, &single);
1486        __dev_close_many(&single);
1487        list_del(&single);
1488}
1489
1490void dev_close_many(struct list_head *head, bool unlink)
1491{
1492        struct net_device *dev, *tmp;
1493
1494        /* Remove the devices that don't need to be closed */
1495        list_for_each_entry_safe(dev, tmp, head, close_list)
1496                if (!(dev->flags & IFF_UP))
1497                        list_del_init(&dev->close_list);
1498
1499        __dev_close_many(head);
1500
1501        list_for_each_entry_safe(dev, tmp, head, close_list) {
1502                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1503                call_netdevice_notifiers(NETDEV_DOWN, dev);
1504                if (unlink)
1505                        list_del_init(&dev->close_list);
1506        }
1507}
1508EXPORT_SYMBOL(dev_close_many);
1509
1510/**
1511 *      dev_close - shutdown an interface.
1512 *      @dev: device to shutdown
1513 *
1514 *      This function moves an active device into down state. A
1515 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1516 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1517 *      chain.
1518 */
1519void dev_close(struct net_device *dev)
1520{
1521        if (dev->flags & IFF_UP) {
1522                LIST_HEAD(single);
1523
1524                list_add(&dev->close_list, &single);
1525                dev_close_many(&single, true);
1526                list_del(&single);
1527        }
1528}
1529EXPORT_SYMBOL(dev_close);
1530
1531
1532/**
1533 *      dev_disable_lro - disable Large Receive Offload on a device
1534 *      @dev: device
1535 *
1536 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1537 *      called under RTNL.  This is needed if received packets may be
1538 *      forwarded to another interface.
1539 */
1540void dev_disable_lro(struct net_device *dev)
1541{
1542        struct net_device *lower_dev;
1543        struct list_head *iter;
1544
1545        dev->wanted_features &= ~NETIF_F_LRO;
1546        netdev_update_features(dev);
1547
1548        if (unlikely(dev->features & NETIF_F_LRO))
1549                netdev_WARN(dev, "failed to disable LRO!\n");
1550
1551        netdev_for_each_lower_dev(dev, lower_dev, iter)
1552                dev_disable_lro(lower_dev);
1553}
1554EXPORT_SYMBOL(dev_disable_lro);
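
/*
 * A minimal sketch of the pattern the kernel-doc above describes:
 * disabling LRO before a device starts forwarding packets.  The name
 * example_prepare_forwarding() is hypothetical; RTNL must be held.
 */
static void example_prepare_forwarding(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_disable_lro(dev);   /* also recurses into lower devices */
}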
1555
1556/**
1557 *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1558 *      @dev: device
1559 *
1560 *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1561 *      called under RTNL.  This is needed if Generic XDP is installed on
1562 *      the device.
1563 */
1564static void dev_disable_gro_hw(struct net_device *dev)
1565{
1566        dev->wanted_features &= ~NETIF_F_GRO_HW;
1567        netdev_update_features(dev);
1568
1569        if (unlikely(dev->features & NETIF_F_GRO_HW))
1570                netdev_WARN(dev, "failed to disable GRO_HW!\n");
1571}
1572
1573const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1574{
1575#define N(val)                                          \
1576        case NETDEV_##val:                              \
1577                return "NETDEV_" __stringify(val);
1578        switch (cmd) {
1579        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1580        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1581        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1582        N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1583        N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1584        N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1585        N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1586        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1587        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1588        }
1589#undef N
1590        return "UNKNOWN_NETDEV_EVENT";
1591}
1592EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1593
1594static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1595                                   struct net_device *dev)
1596{
1597        struct netdev_notifier_info info = {
1598                .dev = dev,
1599        };
1600
1601        return nb->notifier_call(nb, val, &info);
1602}
1603
1604static int dev_boot_phase = 1;
1605
1606/**
1607 * register_netdevice_notifier - register a network notifier block
1608 * @nb: notifier
1609 *
1610 * Register a notifier to be called when network device events occur.
1611 * The notifier passed is linked into the kernel structures and must
1612 * not be reused until it has been unregistered. A negative errno code
1613 * is returned on a failure.
1614 *
1615 * When registered, all registration and up events are replayed
1616 * to the new notifier so that it gets a race-free view of the
1617 * network device list.
1618 */
1619
1620int register_netdevice_notifier(struct notifier_block *nb)
1621{
1622        struct net_device *dev;
1623        struct net_device *last;
1624        struct net *net;
1625        int err;
1626
1627        /* Close race with setup_net() and cleanup_net() */
1628        down_write(&pernet_ops_rwsem);
1629        rtnl_lock();
1630        err = raw_notifier_chain_register(&netdev_chain, nb);
1631        if (err)
1632                goto unlock;
1633        if (dev_boot_phase)
1634                goto unlock;
1635        for_each_net(net) {
1636                for_each_netdev(net, dev) {
1637                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1638                        err = notifier_to_errno(err);
1639                        if (err)
1640                                goto rollback;
1641
1642                        if (!(dev->flags & IFF_UP))
1643                                continue;
1644
1645                        call_netdevice_notifier(nb, NETDEV_UP, dev);
1646                }
1647        }
1648
1649unlock:
1650        rtnl_unlock();
1651        up_write(&pernet_ops_rwsem);
1652        return err;
1653
1654rollback:
1655        last = dev;
1656        for_each_net(net) {
1657                for_each_netdev(net, dev) {
1658                        if (dev == last)
1659                                goto outroll;
1660
1661                        if (dev->flags & IFF_UP) {
1662                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1663                                                        dev);
1664                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1665                        }
1666                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1667                }
1668        }
1669
1670outroll:
1671        raw_notifier_chain_unregister(&netdev_chain, nb);
1672        goto unlock;
1673}
1674EXPORT_SYMBOL(register_netdevice_notifier);
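
/*
 * A minimal notifier sketch, assuming only the APIs visible here plus
 * netdev_notifier_info_to_dev() from <linux/netdevice.h>.  The names
 * example_netdev_event() and example_nb are hypothetical.  As the
 * kernel-doc above notes, NETDEV_REGISTER/NETDEV_UP are replayed for
 * already-existing devices when the block is registered.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                pr_info("%s is up\n", dev->name);
                break;
        case NETDEV_GOING_DOWN:
                pr_info("%s is going down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
        .notifier_call = example_netdev_event,
};

/* Registered with register_netdevice_notifier(&example_nb) and torn down
 * with unregister_netdevice_notifier(&example_nb).
 */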
1675
1676/**
1677 * unregister_netdevice_notifier - unregister a network notifier block
1678 * @nb: notifier
1679 *
1680 * Unregister a notifier previously registered by
1681 * register_netdevice_notifier(). The notifier is unlinked from the
1682 * kernel structures and may then be reused. A negative errno code
1683 * is returned on a failure.
1684 *
1685 * After unregistering, unregister and down device events are synthesized
1686 * for all devices on the device list and sent to the removed notifier,
1687 * removing the need for special-case cleanup code.
1688 */
1689
1690int unregister_netdevice_notifier(struct notifier_block *nb)
1691{
1692        struct net_device *dev;
1693        struct net *net;
1694        int err;
1695
1696        /* Close race with setup_net() and cleanup_net() */
1697        down_write(&pernet_ops_rwsem);
1698        rtnl_lock();
1699        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1700        if (err)
1701                goto unlock;
1702
1703        for_each_net(net) {
1704                for_each_netdev(net, dev) {
1705                        if (dev->flags & IFF_UP) {
1706                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1707                                                        dev);
1708                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1709                        }
1710                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1711                }
1712        }
1713unlock:
1714        rtnl_unlock();
1715        up_write(&pernet_ops_rwsem);
1716        return err;
1717}
1718EXPORT_SYMBOL(unregister_netdevice_notifier);
1719
1720/**
1721 *      call_netdevice_notifiers_info - call all network notifier blocks
1722 *      @val: value passed unmodified to notifier function
1723 *      @info: notifier information data
1724 *
1725 *      Call all network notifier blocks.  Parameters and return value
1726 *      are as for raw_notifier_call_chain().
1727 */
1728
1729static int call_netdevice_notifiers_info(unsigned long val,
1730                                         struct netdev_notifier_info *info)
1731{
1732        ASSERT_RTNL();
1733        return raw_notifier_call_chain(&netdev_chain, val, info);
1734}
1735
1736/**
1737 *      call_netdevice_notifiers - call all network notifier blocks
1738 *      @val: value passed unmodified to notifier function
1739 *      @dev: net_device pointer passed unmodified to notifier function
1740 *
1741 *      Call all network notifier blocks.  Parameters and return value
1742 *      are as for raw_notifier_call_chain().
1743 */
1744
1745int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1746{
1747        struct netdev_notifier_info info = {
1748                .dev = dev,
1749        };
1750
1751        return call_netdevice_notifiers_info(val, &info);
1752}
1753EXPORT_SYMBOL(call_netdevice_notifiers);
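
/*
 * A sketch of how in-kernel code typically raises an event, using a
 * hypothetical helper name example_notify_change().  The ASSERT_RTNL()
 * mirrors the check in call_netdevice_notifiers_info() above, and
 * netdev_cmd_to_name() is handy for debug output.
 */
static void example_notify_change(struct net_device *dev)
{
        ASSERT_RTNL();
        pr_debug("raising %s for %s\n",
                 netdev_cmd_to_name(NETDEV_CHANGE), dev->name);
        call_netdevice_notifiers(NETDEV_CHANGE, dev);
}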
1754
1755/**
1756 *      call_netdevice_notifiers_mtu - call all network notifier blocks
1757 *      @val: value passed unmodified to notifier function
1758 *      @dev: net_device pointer passed unmodified to notifier function
1759 *      @arg: additional u32 argument passed to the notifier function
1760 *
1761 *      Call all network notifier blocks.  Parameters and return value
1762 *      are as for raw_notifier_call_chain().
1763 */
1764static int call_netdevice_notifiers_mtu(unsigned long val,
1765                                        struct net_device *dev, u32 arg)
1766{
1767        struct netdev_notifier_info_ext info = {
1768                .info.dev = dev,
1769                .ext.mtu = arg,
1770        };
1771
1772        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1773
1774        return call_netdevice_notifiers_info(val, &info.info);
1775}
1776
1777#ifdef CONFIG_NET_INGRESS
1778static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
1779
1780void net_inc_ingress_queue(void)
1781{
1782        static_branch_inc(&ingress_needed_key);
1783}
1784EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1785
1786void net_dec_ingress_queue(void)
1787{
1788        static_branch_dec(&ingress_needed_key);
1789}
1790EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1791#endif
1792
1793#ifdef CONFIG_NET_EGRESS
1794static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
1795
1796void net_inc_egress_queue(void)
1797{
1798        static_branch_inc(&egress_needed_key);
1799}
1800EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1801
1802void net_dec_egress_queue(void)
1803{
1804        static_branch_dec(&egress_needed_key);
1805}
1806EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1807#endif
1808
1809static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
1810#ifdef HAVE_JUMP_LABEL
1811static atomic_t netstamp_needed_deferred;
1812static atomic_t netstamp_wanted;
1813static void netstamp_clear(struct work_struct *work)
1814{
1815        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1816        int wanted;
1817
1818        wanted = atomic_add_return(deferred, &netstamp_wanted);
1819        if (wanted > 0)
1820                static_branch_enable(&netstamp_needed_key);
1821        else
1822                static_branch_disable(&netstamp_needed_key);
1823}
1824static DECLARE_WORK(netstamp_work, netstamp_clear);
1825#endif
1826
1827void net_enable_timestamp(void)
1828{
1829#ifdef HAVE_JUMP_LABEL
1830        int wanted;
1831
1832        while (1) {
1833                wanted = atomic_read(&netstamp_wanted);
1834                if (wanted <= 0)
1835                        break;
1836                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1837                        return;
1838        }
1839        atomic_inc(&netstamp_needed_deferred);
1840        schedule_work(&netstamp_work);
1841#else
1842        static_branch_inc(&netstamp_needed_key);
1843#endif
1844}
1845EXPORT_SYMBOL(net_enable_timestamp);
1846
1847void net_disable_timestamp(void)
1848{
1849#ifdef HAVE_JUMP_LABEL
1850        int wanted;
1851
1852        while (1) {
1853                wanted = atomic_read(&netstamp_wanted);
1854                if (wanted <= 1)
1855                        break;
1856                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1857                        return;
1858        }
1859        atomic_dec(&netstamp_needed_deferred);
1860        schedule_work(&netstamp_work);
1861#else
1862        static_branch_dec(&netstamp_needed_key);
1863#endif
1864}
1865EXPORT_SYMBOL(net_disable_timestamp);
1866
1867static inline void net_timestamp_set(struct sk_buff *skb)
1868{
1869        skb->tstamp = 0;
1870        if (static_branch_unlikely(&netstamp_needed_key))
1871                __net_timestamp(skb);
1872}
1873
1874#define net_timestamp_check(COND, SKB)                          \
1875        if (static_branch_unlikely(&netstamp_needed_key)) {     \
1876                if ((COND) && !(SKB)->tstamp)                   \
1877                        __net_timestamp(SKB);                   \
1878        }                                                       \
1879
1880bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1881{
1882        unsigned int len;
1883
1884        if (!(dev->flags & IFF_UP))
1885                return false;
1886
1887        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1888        if (skb->len <= len)
1889                return true;
1890
1891        /* if TSO is enabled, we don't care about the length, as the packet
1892         * could be forwarded without being segmented beforehand
1893         */
1894        if (skb_is_gso(skb))
1895                return true;
1896
1897        return false;
1898}
1899EXPORT_SYMBOL_GPL(is_skb_forwardable);
1900
1901int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1902{
1903        int ret = ____dev_forward_skb(dev, skb);
1904
1905        if (likely(!ret)) {
1906                skb->protocol = eth_type_trans(skb, dev);
1907                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1908        }
1909
1910        return ret;
1911}
1912EXPORT_SYMBOL_GPL(__dev_forward_skb);
1913
1914/**
1915 * dev_forward_skb - loopback an skb to another netif
1916 *
1917 * @dev: destination network device
1918 * @skb: buffer to forward
1919 *
1920 * return values:
1921 *      NET_RX_SUCCESS  (no congestion)
1922 *      NET_RX_DROP     (packet was dropped, but freed)
1923 *
1924 * dev_forward_skb can be used for injecting an skb from the
1925 * start_xmit function of one device into the receive queue
1926 * of another device.
1927 *
1928 * The receiving device may be in another namespace, so
1929 * we have to clear all information in the skb that could
1930 * impact namespace isolation.
1931 */
1932int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1933{
1934        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1935}
1936EXPORT_SYMBOL_GPL(dev_forward_skb);
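
/*
 * A sketch of the start_xmit usage described in the kernel-doc above,
 * roughly the shape used by pair-style drivers such as veth.  The
 * struct example_priv and example_xmit() names are hypothetical.
 */
struct example_priv {
        struct net_device *peer;
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);

        if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS)
                dev->stats.tx_packets++;        /* handed to the peer's rx path */
        else
                dev->stats.tx_dropped++;        /* dropped, but already freed */

        return NETDEV_TX_OK;
}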
1937
1938static inline int deliver_skb(struct sk_buff *skb,
1939                              struct packet_type *pt_prev,
1940                              struct net_device *orig_dev)
1941{
1942        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1943                return -ENOMEM;
1944        refcount_inc(&skb->users);
1945        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1946}
1947
1948static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1949                                          struct packet_type **pt,
1950                                          struct net_device *orig_dev,
1951                                          __be16 type,
1952                                          struct list_head *ptype_list)
1953{
1954        struct packet_type *ptype, *pt_prev = *pt;
1955
1956        list_for_each_entry_rcu(ptype, ptype_list, list) {
1957                if (ptype->type != type)
1958                        continue;
1959                if (pt_prev)
1960                        deliver_skb(skb, pt_prev, orig_dev);
1961                pt_prev = ptype;
1962        }
1963        *pt = pt_prev;
1964}
1965
1966static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1967{
1968        if (!ptype->af_packet_priv || !skb->sk)
1969                return false;
1970
1971        if (ptype->id_match)
1972                return ptype->id_match(ptype, skb->sk);
1973        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1974                return true;
1975
1976        return false;
1977}
1978
1979/*
1980 *      Support routine. Sends outgoing frames to any network
1981 *      taps currently in use.
1982 */
1983
1984void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1985{
1986        struct packet_type *ptype;
1987        struct sk_buff *skb2 = NULL;
1988        struct packet_type *pt_prev = NULL;
1989        struct list_head *ptype_list = &ptype_all;
1990
1991        rcu_read_lock();
1992again:
1993        list_for_each_entry_rcu(ptype, ptype_list, list) {
1994                /* Never send packets back to the socket
1995                 * they originated from - MvS (miquels@drinkel.ow.org)
1996                 */
1997                if (skb_loop_sk(ptype, skb))
1998                        continue;
1999
2000                if (pt_prev) {
2001                        deliver_skb(skb2, pt_prev, skb->dev);
2002                        pt_prev = ptype;
2003                        continue;
2004                }
2005
2006                /* need to clone skb, done only once */
2007                skb2 = skb_clone(skb, GFP_ATOMIC);
2008                if (!skb2)
2009                        goto out_unlock;
2010
2011                net_timestamp_set(skb2);
2012
2013                /* The network header should be correctly
2014                 * set by the sender, so the check below is
2015                 * just protection against buggy protocols.
2016                 */
2017                skb_reset_mac_header(skb2);
2018
2019                if (skb_network_header(skb2) < skb2->data ||
2020                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2021                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2022                                             ntohs(skb2->protocol),
2023                                             dev->name);
2024                        skb_reset_network_header(skb2);
2025                }
2026
2027                skb2->transport_header = skb2->network_header;
2028                skb2->pkt_type = PACKET_OUTGOING;
2029                pt_prev = ptype;
2030        }
2031
2032        if (ptype_list == &ptype_all) {
2033                ptype_list = &dev->ptype_all;
2034                goto again;
2035        }
2036out_unlock:
2037        if (pt_prev) {
2038                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2039                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2040                else
2041                        kfree_skb(skb2);
2042        }
2043        rcu_read_unlock();
2044}
2045EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2046
2047/**
2048 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2049 * @dev: Network device
2050 * @txq: number of queues available
2051 *
2052 * If real_num_tx_queues is changed the tc mappings may no longer be
2053 * valid. To resolve this verify that each tc mapping remains valid
2054 * and, if it is not, zero the mapping. With no priorities mapping to
2055 * an offset/count pair it will no longer be used. In the worst case,
2056 * if TC0 is invalid, nothing can be done, so priority mappings are
2057 * disabled entirely. It is expected that drivers will fix this mapping
2058 * if they can before calling netif_set_real_num_tx_queues.
2059 */
2060static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2061{
2062        int i;
2063        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2064
2065        /* If TC0 is invalidated disable TC mapping */
2066        if (tc->offset + tc->count > txq) {
2067                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2068                dev->num_tc = 0;
2069                return;
2070        }
2071
2072        /* Invalidated prio to tc mappings set to TC0 */
2073        for (i = 1; i < TC_BITMASK + 1; i++) {
2074                int q = netdev_get_prio_tc_map(dev, i);
2075
2076                tc = &dev->tc_to_txq[q];
2077                if (tc->offset + tc->count > txq) {
2078                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2079                                i, q);
2080                        netdev_set_prio_tc_map(dev, i, 0);
2081                }
2082        }
2083}
2084
2085int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2086{
2087        if (dev->num_tc) {
2088                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2089                int i;
2090
2091                /* walk through the TCs and see if it falls into any of them */
2092                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2093                        if ((txq - tc->offset) < tc->count)
2094                                return i;
2095                }
2096
2097                /* didn't find it, just return -1 to indicate no match */
2098                return -1;
2099        }
2100
2101        return 0;
2102}
2103EXPORT_SYMBOL(netdev_txq_to_tc);
2104
2105#ifdef CONFIG_XPS
2106struct static_key xps_needed __read_mostly;
2107EXPORT_SYMBOL(xps_needed);
2108struct static_key xps_rxqs_needed __read_mostly;
2109EXPORT_SYMBOL(xps_rxqs_needed);
2110static DEFINE_MUTEX(xps_map_mutex);
2111#define xmap_dereference(P)             \
2112        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2113
2114static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2115                             int tci, u16 index)
2116{
2117        struct xps_map *map = NULL;
2118        int pos;
2119
2120        if (dev_maps)
2121                map = xmap_dereference(dev_maps->attr_map[tci]);
2122        if (!map)
2123                return false;
2124
2125        for (pos = map->len; pos--;) {
2126                if (map->queues[pos] != index)
2127                        continue;
2128
2129                if (map->len > 1) {
2130                        map->queues[pos] = map->queues[--map->len];
2131                        break;
2132                }
2133
2134                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2135                kfree_rcu(map, rcu);
2136                return false;
2137        }
2138
2139        return true;
2140}
2141
2142static bool remove_xps_queue_cpu(struct net_device *dev,
2143                                 struct xps_dev_maps *dev_maps,
2144                                 int cpu, u16 offset, u16 count)
2145{
2146        int num_tc = dev->num_tc ? : 1;
2147        bool active = false;
2148        int tci;
2149
2150        for (tci = cpu * num_tc; num_tc--; tci++) {
2151                int i, j;
2152
2153                for (i = count, j = offset; i--; j++) {
2154                        if (!remove_xps_queue(dev_maps, tci, j))
2155                                break;
2156                }
2157
2158                active |= i < 0;
2159        }
2160
2161        return active;
2162}
2163
2164static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2165                           struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2166                           u16 offset, u16 count, bool is_rxqs_map)
2167{
2168        bool active = false;
2169        int i, j;
2170
2171        for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2172             j < nr_ids;)
2173                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2174                                               count);
2175        if (!active) {
2176                if (is_rxqs_map) {
2177                        RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2178                } else {
2179                        RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2180
2181                        for (i = offset + (count - 1); count--; i--)
2182                                netdev_queue_numa_node_write(
2183                                        netdev_get_tx_queue(dev, i),
2184                                                        NUMA_NO_NODE);
2185                }
2186                kfree_rcu(dev_maps, rcu);
2187        }
2188}
2189
2190static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2191                                   u16 count)
2192{
2193        const unsigned long *possible_mask = NULL;
2194        struct xps_dev_maps *dev_maps;
2195        unsigned int nr_ids;
2196
2197        if (!static_key_false(&xps_needed))
2198                return;
2199
2200        cpus_read_lock();
2201        mutex_lock(&xps_map_mutex);
2202
2203        if (static_key_false(&xps_rxqs_needed)) {
2204                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2205                if (dev_maps) {
2206                        nr_ids = dev->num_rx_queues;
2207                        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2208                                       offset, count, true);
2209                }
2210        }
2211
2212        dev_maps = xmap_dereference(dev->xps_cpus_map);
2213        if (!dev_maps)
2214                goto out_no_maps;
2215
2216        if (num_possible_cpus() > 1)
2217                possible_mask = cpumask_bits(cpu_possible_mask);
2218        nr_ids = nr_cpu_ids;
2219        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2220                       false);
2221
2222out_no_maps:
2223        if (static_key_enabled(&xps_rxqs_needed))
2224                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2225
2226        static_key_slow_dec_cpuslocked(&xps_needed);
2227        mutex_unlock(&xps_map_mutex);
2228        cpus_read_unlock();
2229}
2230
2231static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2232{
2233        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2234}
2235
2236static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2237                                      u16 index, bool is_rxqs_map)
2238{
2239        struct xps_map *new_map;
2240        int alloc_len = XPS_MIN_MAP_ALLOC;
2241        int i, pos;
2242
2243        for (pos = 0; map && pos < map->len; pos++) {
2244                if (map->queues[pos] != index)
2245                        continue;
2246                return map;
2247        }
2248
2249        /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2250        if (map) {
2251                if (pos < map->alloc_len)
2252                        return map;
2253
2254                alloc_len = map->alloc_len * 2;
2255        }
2256
2257        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2258         *  map
2259         */
2260        if (is_rxqs_map)
2261                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2262        else
2263                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2264                                       cpu_to_node(attr_index));
2265        if (!new_map)
2266                return NULL;
2267
2268        for (i = 0; i < pos; i++)
2269                new_map->queues[i] = map->queues[i];
2270        new_map->alloc_len = alloc_len;
2271        new_map->len = pos;
2272
2273        return new_map;
2274}
2275
2276/* Must be called under cpus_read_lock */
2277int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2278                          u16 index, bool is_rxqs_map)
2279{
2280        const unsigned long *online_mask = NULL, *possible_mask = NULL;
2281        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2282        int i, j, tci, numa_node_id = -2;
2283        int maps_sz, num_tc = 1, tc = 0;
2284        struct xps_map *map, *new_map;
2285        bool active = false;
2286        unsigned int nr_ids;
2287
2288        if (dev->num_tc) {
2289                /* Do not allow XPS on subordinate device directly */
2290                num_tc = dev->num_tc;
2291                if (num_tc < 0)
2292                        return -EINVAL;
2293
2294                /* If queue belongs to subordinate dev use its map */
2295                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2296
2297                tc = netdev_txq_to_tc(dev, index);
2298                if (tc < 0)
2299                        return -EINVAL;
2300        }
2301
2302        mutex_lock(&xps_map_mutex);
2303        if (is_rxqs_map) {
2304                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2305                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2306                nr_ids = dev->num_rx_queues;
2307        } else {
2308                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2309                if (num_possible_cpus() > 1) {
2310                        online_mask = cpumask_bits(cpu_online_mask);
2311                        possible_mask = cpumask_bits(cpu_possible_mask);
2312                }
2313                dev_maps = xmap_dereference(dev->xps_cpus_map);
2314                nr_ids = nr_cpu_ids;
2315        }
2316
2317        if (maps_sz < L1_CACHE_BYTES)
2318                maps_sz = L1_CACHE_BYTES;
2319
2320        /* allocate memory for queue storage */
2321        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2322             j < nr_ids;) {
2323                if (!new_dev_maps)
2324                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2325                if (!new_dev_maps) {
2326                        mutex_unlock(&xps_map_mutex);
2327                        return -ENOMEM;
2328                }
2329
2330                tci = j * num_tc + tc;
2331                map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2332                                 NULL;
2333
2334                map = expand_xps_map(map, j, index, is_rxqs_map);
2335                if (!map)
2336                        goto error;
2337
2338                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2339        }
2340
2341        if (!new_dev_maps)
2342                goto out_no_new_maps;
2343
2344        static_key_slow_inc_cpuslocked(&xps_needed);
2345        if (is_rxqs_map)
2346                static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2347
2348        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2349             j < nr_ids;) {
2350                /* copy maps belonging to foreign traffic classes */
2351                for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2352                        /* fill in the new device map from the old device map */
2353                        map = xmap_dereference(dev_maps->attr_map[tci]);
2354                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2355                }
2356
2357                /* We need to explicitly update tci as the previous loop
2358                 * could break out early if dev_maps is NULL.
2359                 */
2360                tci = j * num_tc + tc;
2361
2362                if (netif_attr_test_mask(j, mask, nr_ids) &&
2363                    netif_attr_test_online(j, online_mask, nr_ids)) {
2364                        /* add tx-queue to CPU/rx-queue maps */
2365                        int pos = 0;
2366
2367                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
2368                        while ((pos < map->len) && (map->queues[pos] != index))
2369                                pos++;
2370
2371                        if (pos == map->len)
2372                                map->queues[map->len++] = index;
2373#ifdef CONFIG_NUMA
2374                        if (!is_rxqs_map) {
2375                                if (numa_node_id == -2)
2376                                        numa_node_id = cpu_to_node(j);
2377                                else if (numa_node_id != cpu_to_node(j))
2378                                        numa_node_id = -1;
2379                        }
2380#endif
2381                } else if (dev_maps) {
2382                        /* fill in the new device map from the old device map */
2383                        map = xmap_dereference(dev_maps->attr_map[tci]);
2384                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2385                }
2386
2387                /* copy maps belonging to foreign traffic classes */
2388                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2389                        /* fill in the new device map from the old device map */
2390                        map = xmap_dereference(dev_maps->attr_map[tci]);
2391                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2392                }
2393        }
2394
2395        if (is_rxqs_map)
2396                rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2397        else
2398                rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2399
2400        /* Cleanup old maps */
2401        if (!dev_maps)
2402                goto out_no_old_maps;
2403
2404        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2405             j < nr_ids;) {
2406                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2407                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2408                        map = xmap_dereference(dev_maps->attr_map[tci]);
2409                        if (map && map != new_map)
2410                                kfree_rcu(map, rcu);
2411                }
2412        }
2413
2414        kfree_rcu(dev_maps, rcu);
2415
2416out_no_old_maps:
2417        dev_maps = new_dev_maps;
2418        active = true;
2419
2420out_no_new_maps:
2421        if (!is_rxqs_map) {
2422                /* update Tx queue numa node */
2423                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2424                                             (numa_node_id >= 0) ?
2425                                             numa_node_id : NUMA_NO_NODE);
2426        }
2427
2428        if (!dev_maps)
2429                goto out_no_maps;
2430
2431        /* removes tx-queue from unused CPUs/rx-queues */
2432        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2433             j < nr_ids;) {
2434                for (i = tc, tci = j * num_tc; i--; tci++)
2435                        active |= remove_xps_queue(dev_maps, tci, index);
2436                if (!netif_attr_test_mask(j, mask, nr_ids) ||
2437                    !netif_attr_test_online(j, online_mask, nr_ids))
2438                        active |= remove_xps_queue(dev_maps, tci, index);
2439                for (i = num_tc - tc, tci++; --i; tci++)
2440                        active |= remove_xps_queue(dev_maps, tci, index);
2441        }
2442
2443        /* free map if not active */
2444        if (!active) {
2445                if (is_rxqs_map)
2446                        RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2447                else
2448                        RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2449                kfree_rcu(dev_maps, rcu);
2450        }
2451
2452out_no_maps:
2453        mutex_unlock(&xps_map_mutex);
2454
2455        return 0;
2456error:
2457        /* remove any maps that we added */
2458        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2459             j < nr_ids;) {
2460                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2461                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2462                        map = dev_maps ?
2463                              xmap_dereference(dev_maps->attr_map[tci]) :
2464                              NULL;
2465                        if (new_map && new_map != map)
2466                                kfree(new_map);
2467                }
2468        }
2469
2470        mutex_unlock(&xps_map_mutex);
2471
2472        kfree(new_dev_maps);
2473        return -ENOMEM;
2474}
2475EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2476
2477int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2478                        u16 index)
2479{
2480        int ret;
2481
2482        cpus_read_lock();
2483        ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2484        cpus_read_unlock();
2485
2486        return ret;
2487}
2488EXPORT_SYMBOL(netif_set_xps_queue);
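
/*
 * A sketch of a driver spreading its Tx queues across CPUs with
 * netif_set_xps_queue(), one queue per online CPU.  example_setup_xps()
 * is a hypothetical name; the cpumask handling uses standard helpers
 * from <linux/cpumask.h>.
 */
static int example_setup_xps(struct net_device *dev)
{
        cpumask_var_t mask;
        int cpu, qid = 0, err = 0;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        for_each_online_cpu(cpu) {
                if (qid >= dev->real_num_tx_queues)
                        break;
                cpumask_clear(mask);
                cpumask_set_cpu(cpu, mask);
                err = netif_set_xps_queue(dev, mask, qid++);
                if (err)
                        break;
        }
        free_cpumask_var(mask);
        return err;
}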
2489
2490#endif
2491static void netdev_unbind_all_sb_channels(struct net_device *dev)
2492{
2493        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2494
2495        /* Unbind any subordinate channels */
2496        while (txq-- != &dev->_tx[0]) {
2497                if (txq->sb_dev)
2498                        netdev_unbind_sb_channel(dev, txq->sb_dev);
2499        }
2500}
2501
2502void netdev_reset_tc(struct net_device *dev)
2503{
2504#ifdef CONFIG_XPS
2505        netif_reset_xps_queues_gt(dev, 0);
2506#endif
2507        netdev_unbind_all_sb_channels(dev);
2508
2509        /* Reset TC configuration of device */
2510        dev->num_tc = 0;
2511        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2512        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2513}
2514EXPORT_SYMBOL(netdev_reset_tc);
2515
2516int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2517{
2518        if (tc >= dev->num_tc)
2519                return -EINVAL;
2520
2521#ifdef CONFIG_XPS
2522        netif_reset_xps_queues(dev, offset, count);
2523#endif
2524        dev->tc_to_txq[tc].count = count;
2525        dev->tc_to_txq[tc].offset = offset;
2526        return 0;
2527}
2528EXPORT_SYMBOL(netdev_set_tc_queue);
2529
2530int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2531{
2532        if (num_tc > TC_MAX_QUEUE)
2533                return -EINVAL;
2534
2535#ifdef CONFIG_XPS
2536        netif_reset_xps_queues_gt(dev, 0);
2537#endif
2538        netdev_unbind_all_sb_channels(dev);
2539
2540        dev->num_tc = num_tc;
2541        return 0;
2542}
2543EXPORT_SYMBOL(netdev_set_num_tc);
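
/*
 * A sketch of a driver configuring two traffic classes that split the
 * Tx queues evenly, using the helpers above plus netdev_set_prio_tc_map()
 * from <linux/netdevice.h>.  example_setup_tc() and the even split are
 * illustrative assumptions, not a recommended policy.
 */
static int example_setup_tc(struct net_device *dev)
{
        u16 half = dev->real_num_tx_queues / 2;
        int err, prio;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        err = netdev_set_tc_queue(dev, 0, half, 0);     /* TC0: [0, half) */
        if (err)
                return err;
        err = netdev_set_tc_queue(dev, 1, half, half);  /* TC1: [half, 2*half) */
        if (err)
                return err;

        for (prio = 0; prio <= TC_BITMASK; prio++)
                netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);

        return 0;
}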
2544
2545void netdev_unbind_sb_channel(struct net_device *dev,
2546                              struct net_device *sb_dev)
2547{
2548        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2549
2550#ifdef CONFIG_XPS
2551        netif_reset_xps_queues_gt(sb_dev, 0);
2552#endif
2553        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2554        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2555
2556        while (txq-- != &dev->_tx[0]) {
2557                if (txq->sb_dev == sb_dev)
2558                        txq->sb_dev = NULL;
2559        }
2560}
2561EXPORT_SYMBOL(netdev_unbind_sb_channel);
2562
2563int netdev_bind_sb_channel_queue(struct net_device *dev,
2564                                 struct net_device *sb_dev,
2565                                 u8 tc, u16 count, u16 offset)
2566{
2567        /* Make certain the sb_dev and dev are already configured */
2568        if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2569                return -EINVAL;
2570
2571        /* We cannot hand out queues we don't have */
2572        if ((offset + count) > dev->real_num_tx_queues)
2573                return -EINVAL;
2574
2575        /* Record the mapping */
2576        sb_dev->tc_to_txq[tc].count = count;
2577        sb_dev->tc_to_txq[tc].offset = offset;
2578
2579        /* Provide a way for Tx queue to find the tc_to_txq map or
2580         * XPS map for itself.
2581         */
2582        while (count--)
2583                netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2584
2585        return 0;
2586}
2587EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2588
2589int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2590{
2591        /* Do not use a multiqueue device to represent a subordinate channel */
2592        if (netif_is_multiqueue(dev))
2593                return -ENODEV;
2594
2595        /* We allow channels 1 - 32767 to be used for subordinate channels.
2596         * Channel 0 is meant to be "native" mode and used only to represent
2597         * the main root device. We allow writing 0 to reset the device back
2598         * to normal mode after being used as a subordinate channel.
2599         */
2600        if (channel > S16_MAX)
2601                return -EINVAL;
2602
2603        dev->num_tc = -channel;
2604
2605        return 0;
2606}
2607EXPORT_SYMBOL(netdev_set_sb_channel);
2608
2609/*
2610 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2611 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2612 */
2613int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2614{
2615        bool disabling;
2616        int rc;
2617
2618        disabling = txq < dev->real_num_tx_queues;
2619
2620        if (txq < 1 || txq > dev->num_tx_queues)
2621                return -EINVAL;
2622
2623        if (dev->reg_state == NETREG_REGISTERED ||
2624            dev->reg_state == NETREG_UNREGISTERING) {
2625                ASSERT_RTNL();
2626
2627                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2628                                                  txq);
2629                if (rc)
2630                        return rc;
2631
2632                if (dev->num_tc)
2633                        netif_setup_tc(dev, txq);
2634
2635                dev->real_num_tx_queues = txq;
2636
2637                if (disabling) {
2638                        synchronize_net();
2639                        qdisc_reset_all_tx_gt(dev, txq);
2640#ifdef CONFIG_XPS
2641                        netif_reset_xps_queues_gt(dev, txq);
2642#endif
2643                }
2644        } else {
2645                dev->real_num_tx_queues = txq;
2646        }
2647
2648        return 0;
2649}
2650EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2651
2652#ifdef CONFIG_SYSFS
2653/**
2654 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2655 *      @dev: Network device
2656 *      @rxq: Actual number of RX queues
2657 *
2658 *      This must be called either with the rtnl_lock held or before
2659 *      registration of the net device.  Returns 0 on success, or a
2660 *      negative error code.  If called before registration, it always
2661 *      succeeds.
2662 */
2663int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2664{
2665        int rc;
2666
2667        if (rxq < 1 || rxq > dev->num_rx_queues)
2668                return -EINVAL;
2669
2670        if (dev->reg_state == NETREG_REGISTERED) {
2671                ASSERT_RTNL();
2672
2673                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2674                                                  rxq);
2675                if (rc)
2676                        return rc;
2677        }
2678
2679        dev->real_num_rx_queues = rxq;
2680        return 0;
2681}
2682EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2683#endif
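
/*
 * A sketch of how a multiqueue driver might change its active queue
 * counts at runtime, e.g. from an ethtool ->set_channels() handler.
 * example_set_queues() is a hypothetical name; once the device is
 * registered both helpers must be called with RTNL held.
 */
static int example_set_queues(struct net_device *dev, unsigned int count)
{
        int err;

        ASSERT_RTNL();

        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;

        return netif_set_real_num_rx_queues(dev, count);
}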
2684
2685/**
2686 * netif_get_num_default_rss_queues - default number of RSS queues
2687 *
2688 * This routine should set an upper limit on the number of RSS queues
2689 * used by default by multiqueue devices.
2690 */
2691int netif_get_num_default_rss_queues(void)
2692{
2693        return is_kdump_kernel() ?
2694                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2695}
2696EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2697
2698static void __netif_reschedule(struct Qdisc *q)
2699{
2700        struct softnet_data *sd;
2701        unsigned long flags;
2702
2703        local_irq_save(flags);
2704        sd = this_cpu_ptr(&softnet_data);
2705        q->next_sched = NULL;
2706        *sd->output_queue_tailp = q;
2707        sd->output_queue_tailp = &q->next_sched;
2708        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2709        local_irq_restore(flags);
2710}
2711
2712void __netif_schedule(struct Qdisc *q)
2713{
2714        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2715                __netif_reschedule(q);
2716}
2717EXPORT_SYMBOL(__netif_schedule);
2718
2719struct dev_kfree_skb_cb {
2720        enum skb_free_reason reason;
2721};
2722
2723static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2724{
2725        return (struct dev_kfree_skb_cb *)skb->cb;
2726}
2727
2728void netif_schedule_queue(struct netdev_queue *txq)
2729{
2730        rcu_read_lock();
2731        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2732                struct Qdisc *q = rcu_dereference(txq->qdisc);
2733
2734                __netif_schedule(q);
2735        }
2736        rcu_read_unlock();
2737}
2738EXPORT_SYMBOL(netif_schedule_queue);
2739
2740void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2741{
2742        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2743                struct Qdisc *q;
2744
2745                rcu_read_lock();
2746                q = rcu_dereference(dev_queue->qdisc);
2747                __netif_schedule(q);
2748                rcu_read_unlock();
2749        }
2750}
2751EXPORT_SYMBOL(netif_tx_wake_queue);
2752
2753void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2754{
2755        unsigned long flags;
2756
2757        if (unlikely(!skb))
2758                return;
2759
2760        if (likely(refcount_read(&skb->users) == 1)) {
2761                smp_rmb();
2762                refcount_set(&skb->users, 0);
2763        } else if (likely(!refcount_dec_and_test(&skb->users))) {
2764                return;
2765        }
2766        get_kfree_skb_cb(skb)->reason = reason;
2767        local_irq_save(flags);
2768        skb->next = __this_cpu_read(softnet_data.completion_queue);
2769        __this_cpu_write(softnet_data.completion_queue, skb);
2770        raise_softirq_irqoff(NET_TX_SOFTIRQ);
2771        local_irq_restore(flags);
2772}
2773EXPORT_SYMBOL(__dev_kfree_skb_irq);
2774
2775void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2776{
2777        if (in_irq() || irqs_disabled())
2778                __dev_kfree_skb_irq(skb, reason);
2779        else
2780                dev_kfree_skb(skb);
2781}
2782EXPORT_SYMBOL(__dev_kfree_skb_any);
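
/*
 * A sketch of freeing skbs from a context that may be hard-irq, as a Tx
 * completion handler often is.  dev_consume_skb_any() and
 * dev_kfree_skb_any() are the <linux/netdevice.h> wrappers around
 * __dev_kfree_skb_any() above; example_tx_clean() is a hypothetical name.
 */
static void example_tx_clean(struct sk_buff *skb, bool transmitted)
{
        if (transmitted)
                dev_consume_skb_any(skb);       /* successful Tx, not a drop */
        else
                dev_kfree_skb_any(skb);         /* accounted as a drop */
}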
2783
2784
2785/**
2786 * netif_device_detach - mark device as removed
2787 * @dev: network device
2788 *
2789 * Mark device as removed from the system and therefore no longer available.
2790 */
2791void netif_device_detach(struct net_device *dev)
2792{
2793        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2794            netif_running(dev)) {
2795                netif_tx_stop_all_queues(dev);
2796        }
2797}
2798EXPORT_SYMBOL(netif_device_detach);
2799
2800/**
2801 * netif_device_attach - mark device as attached
2802 * @dev: network device
2803 *
2804 * Mark device as attached to the system and restart it if needed.
2805 */
2806void netif_device_attach(struct net_device *dev)
2807{
2808        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2809            netif_running(dev)) {
2810                netif_tx_wake_all_queues(dev);
2811                __netdev_watchdog_up(dev);
2812        }
2813}
2814EXPORT_SYMBOL(netif_device_attach);
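
/*
 * A sketch of the usual suspend/resume pairing for the two helpers above.
 * example_suspend()/example_resume() are hypothetical driver callbacks.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stops Tx queues if running */
        /* ... quiesce and power down the hardware ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... reinitialize the hardware ... */
        netif_device_attach(dev);       /* restarts queues and watchdog */
        return 0;
}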
2815
2816/*
2817 * Returns a Tx hash based on the given packet descriptor and the number of
2818 * Tx queues to be used as a distribution range.
2819 */
2820static u16 skb_tx_hash(const struct net_device *dev,
2821                       const struct net_device *sb_dev,
2822                       struct sk_buff *skb)
2823{
2824        u32 hash;
2825        u16 qoffset = 0;
2826        u16 qcount = dev->real_num_tx_queues;
2827
2828        if (dev->num_tc) {
2829                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2830
2831                qoffset = sb_dev->tc_to_txq[tc].offset;
2832                qcount = sb_dev->tc_to_txq[tc].count;
2833        }
2834
2835        if (skb_rx_queue_recorded(skb)) {
2836                hash = skb_get_rx_queue(skb);
2837                while (unlikely(hash >= qcount))
2838                        hash -= qcount;
2839                return hash + qoffset;
2840        }
2841
2842        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2843}
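
/* A worked example of the scaling above, with made-up numbers:
 * reciprocal_scale(h, qcount) computes ((u64)h * qcount) >> 32, so with
 * qcount = 8 and h = 0x80000000 the result is 4; with qoffset = 8 (say,
 * a second traffic class) the selected queue would then be 12.
 */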
2844
2845static void skb_warn_bad_offload(const struct sk_buff *skb)
2846{
2847        static const netdev_features_t null_features;
2848        struct net_device *dev = skb->dev;
2849        const char *name = "";
2850
2851        if (!net_ratelimit())
2852                return;
2853
2854        if (dev) {
2855                if (dev->dev.parent)
2856                        name = dev_driver_string(dev->dev.parent);
2857                else
2858                        name = netdev_name(dev);
2859        }
2860        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2861             "gso_type=%d ip_summed=%d\n",
2862             name, dev ? &dev->features : &null_features,
2863             skb->sk ? &skb->sk->sk_route_caps : &null_features,
2864             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2865             skb_shinfo(skb)->gso_type, skb->ip_summed);
2866}
2867
2868/*
2869 * Invalidate hardware checksum when packet is to be mangled, and
2870 * complete checksum manually on outgoing path.
2871 */
2872int skb_checksum_help(struct sk_buff *skb)
2873{
2874        __wsum csum;
2875        int ret = 0, offset;
2876
2877        if (skb->ip_summed == CHECKSUM_COMPLETE)
2878                goto out_set_summed;
2879
2880        if (unlikely(skb_shinfo(skb)->gso_size)) {
2881                skb_warn_bad_offload(skb);
2882                return -EINVAL;
2883        }
2884
2885        /* Before computing a checksum, we should make sure no frag could
2886         * be modified by an external entity: the checksum could be wrong.
2887         */
2888        if (skb_has_shared_frag(skb)) {
2889                ret = __skb_linearize(skb);
2890                if (ret)
2891                        goto out;
2892        }
2893
2894        offset = skb_checksum_start_offset(skb);
2895        BUG_ON(offset >= skb_headlen(skb));
2896        csum = skb_checksum(skb, offset, skb->len - offset, 0);
2897
2898        offset += skb->csum_offset;
2899        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2900
2901        if (skb_cloned(skb) &&
2902            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2903                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2904                if (ret)
2905                        goto out;
2906        }
2907
2908        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2909out_set_summed:
2910        skb->ip_summed = CHECKSUM_NONE;
2911out:
2912        return ret;
2913}
2914EXPORT_SYMBOL(skb_checksum_help);
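
/*
 * A sketch of the standard fallback in a driver's xmit path when the
 * hardware cannot checksum a particular CHECKSUM_PARTIAL packet.
 * example_tx_csum() and the hw_can_csum flag are hypothetical.
 */
static int example_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
                return skb_checksum_help(skb);  /* compute it in software */
        return 0;
}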
2915
2916int skb_crc32c_csum_help(struct sk_buff *skb)
2917{
2918        __le32 crc32c_csum;
2919        int ret = 0, offset, start;
2920
2921        if (skb->ip_summed != CHECKSUM_PARTIAL)
2922                goto out;
2923
2924        if (unlikely(skb_is_gso(skb)))
2925                goto out;
2926
2927        /* Before computing a checksum, we should make sure no frag could
2928         * be modified by an external entity: the checksum could be wrong.
2929         */
2930        if (unlikely(skb_has_shared_frag(skb))) {
2931                ret = __skb_linearize(skb);
2932                if (ret)
2933                        goto out;
2934        }
2935        start = skb_checksum_start_offset(skb);
2936        offset = start + offsetof(struct sctphdr, checksum);
2937        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2938                ret = -EINVAL;
2939                goto out;
2940        }
2941        if (skb_cloned(skb) &&
2942            !skb_clone_writable(skb, offset + sizeof(__le32))) {
2943                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2944                if (ret)
2945                        goto out;
2946        }
2947        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2948                                                  skb->len - start, ~(__u32)0,
2949                                                  crc32c_csum_stub));
2950        *(__le32 *)(skb->data + offset) = crc32c_csum;
2951        skb->ip_summed = CHECKSUM_NONE;
2952        skb->csum_not_inet = 0;
2953out:
2954        return ret;
2955}
2956
2957__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2958{
2959        __be16 type = skb->protocol;
2960
2961        /* Tunnel gso handlers can set protocol to ethernet. */
2962        if (type == htons(ETH_P_TEB)) {
2963                struct ethhdr *eth;
2964
2965                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2966                        return 0;
2967
2968                eth = (struct ethhdr *)skb->data;
2969                type = eth->h_proto;
2970        }
2971
2972        return __vlan_get_protocol(skb, type, depth);
2973}
2974
2975/**
2976 *      skb_mac_gso_segment - mac layer segmentation handler.
2977 *      @skb: buffer to segment
2978 *      @features: features for the output path (see dev->features)
2979 */
2980struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2981                                    netdev_features_t features)
2982{
2983        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2984        struct packet_offload *ptype;
2985        int vlan_depth = skb->mac_len;
2986        __be16 type = skb_network_protocol(skb, &vlan_depth);
2987
2988        if (unlikely(!type))
2989                return ERR_PTR(-EINVAL);
2990
2991        __skb_pull(skb, vlan_depth);
2992
2993        rcu_read_lock();
2994        list_for_each_entry_rcu(ptype, &offload_base, list) {
2995                if (ptype->type == type && ptype->callbacks.gso_segment) {
2996                        segs = ptype->callbacks.gso_segment(skb, features);
2997                        break;
2998                }
2999        }
3000        rcu_read_unlock();
3001
3002        __skb_push(skb, skb->data - skb_mac_header(skb));
3003
3004        return segs;
3005}
3006EXPORT_SYMBOL(skb_mac_gso_segment);
3007
3008
3009/* openvswitch calls this on rx path, so we need a different check.
3010 */
3011static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3012{
3013        if (tx_path)
3014                return skb->ip_summed != CHECKSUM_PARTIAL &&
3015                       skb->ip_summed != CHECKSUM_UNNECESSARY;
3016
3017        return skb->ip_summed == CHECKSUM_NONE;
3018}
3019
3020/**
3021 *      __skb_gso_segment - Perform segmentation on skb.
3022 *      @skb: buffer to segment
3023 *      @features: features for the output path (see dev->features)
3024 *      @tx_path: whether it is called in TX path
3025 *
3026 *      This function segments the given skb and returns a list of segments.
3027 *
3028 *      It may return NULL if the skb requires no segmentation.  This is
3029 *      only possible when GSO is used for verifying header integrity.
3030 *
3031 *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3032 */
3033struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3034                                  netdev_features_t features, bool tx_path)
3035{
3036        struct sk_buff *segs;
3037
3038        if (unlikely(skb_needs_check(skb, tx_path))) {
3039                int err;
3040
3041                /* We're going to init ->check field in TCP or UDP header */
3042                err = skb_cow_head(skb, 0);
3043                if (err < 0)
3044                        return ERR_PTR(err);
3045        }
3046
3047        /* Only report GSO partial support if it will enable us to
3048         * support segmentation on this frame without needing additional
3049         * work.
3050         */
3051        if (features & NETIF_F_GSO_PARTIAL) {
3052                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3053                struct net_device *dev = skb->dev;
3054
3055                partial_features |= dev->features & dev->gso_partial_features;
3056                if (!skb_gso_ok(skb, features | partial_features))
3057                        features &= ~NETIF_F_GSO_PARTIAL;
3058        }
3059
3060        BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3061                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3062
3063        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3064        SKB_GSO_CB(skb)->encap_level = 0;
3065
3066        skb_reset_mac_header(skb);
3067        skb_reset_mac_len(skb);
3068
3069        segs = skb_mac_gso_segment(skb, features);
3070
3071        if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3072                skb_warn_bad_offload(skb);
3073
3074        return segs;
3075}
3076EXPORT_SYMBOL(__skb_gso_segment);
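
/*
 * A sketch of a software GSO fallback, assuming skb_gso_segment() from
 * <linux/netdevice.h>, which wraps __skb_gso_segment(skb, features, true).
 * example_sw_gso() is a hypothetical name; the sketch frees the segments
 * where a real driver would hand them to the hardware.
 */
static int example_sw_gso(struct sk_buff *skb, netdev_features_t features)
{
        struct sk_buff *segs, *seg;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return 0;               /* no segmentation was needed */

        consume_skb(skb);               /* the original is no longer needed */
        while ((seg = segs) != NULL) {
                segs = seg->next;
                seg->next = NULL;
                consume_skb(seg);       /* a real driver would transmit seg here */
        }
        return 0;
}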
3077
3078/* Take action when hardware reception checksum errors are detected. */
3079#ifdef CONFIG_BUG
3080void netdev_rx_csum_fault(struct net_device *dev)
3081{
3082        if (net_ratelimit()) {
3083                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3084                dump_stack();
3085        }
3086}
3087EXPORT_SYMBOL(netdev_rx_csum_fault);
3088#endif
3089
3090/* XXX: check that highmem exists at all on the given machine. */
3091static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3092{
3093#ifdef CONFIG_HIGHMEM
3094        int i;
3095
3096        if (!(dev->features & NETIF_F_HIGHDMA)) {
3097                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3098                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3099
3100                        if (PageHighMem(skb_frag_page(frag)))
3101                                return 1;
3102                }
3103        }
3104#endif
3105        return 0;
3106}
3107
3108/* If MPLS offload request, verify we are testing hardware MPLS features
3109 * instead of standard features for the netdev.
3110 */
3111#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3112static netdev_features_t net_mpls_features(struct sk_buff *skb,
3113                                           netdev_features_t features,
3114                                           __be16 type)
3115{
3116        if (eth_p_mpls(type))
3117                features &= skb->dev->mpls_features;
3118
3119        return features;
3120}
3121#else
3122static netdev_features_t net_mpls_features(struct sk_buff *skb,
3123                                           netdev_features_t features,
3124                                           __be16 type)
3125{
3126        return features;
3127}
3128#endif
3129
3130static netdev_features_t harmonize_features(struct sk_buff *skb,
3131        netdev_features_t features)
3132{
3133        int tmp;
3134        __be16 type;
3135
3136        type = skb_network_protocol(skb, &tmp);
3137        features = net_mpls_features(skb, features, type);
3138
3139        if (skb->ip_summed != CHECKSUM_NONE &&
3140            !can_checksum_protocol(features, type)) {
3141                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3142        }
3143        if (illegal_highdma(skb->dev, skb))
3144                features &= ~NETIF_F_SG;
3145
3146        return features;
3147}
3148
3149netdev_features_t passthru_features_check(struct sk_buff *skb,
3150                                          struct net_device *dev,
3151                                          netdev_features_t features)
3152{
3153        return features;
3154}
3155EXPORT_SYMBOL(passthru_features_check);
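
/* Illustrative sketch (hypothetical driver): passthru_features_check() lets
 * a device that wants netif_skb_features() to leave its advertised features
 * untouched plug the helper straight into its ops:
 *
 *	static const struct net_device_ops example_netdev_ops = {
 *		.ndo_start_xmit		= example_start_xmit,
 *		.ndo_features_check	= passthru_features_check,
 *	};
 *
 * Devices that provide no ndo_features_check fall back to
 * dflt_features_check() below, which applies the VLAN restrictions.
 */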
3156
3157static netdev_features_t dflt_features_check(struct sk_buff *skb,
3158                                             struct net_device *dev,
3159                                             netdev_features_t features)
3160{
3161        return vlan_features_check(skb, features);
3162}
3163
3164static netdev_features_t gso_features_check(const struct sk_buff *skb,
3165                                            struct net_device *dev,
3166                                            netdev_features_t features)
3167{
3168        u16 gso_segs = skb_shinfo(skb)->gso_segs;
3169
3170        if (gso_segs > dev->gso_max_segs)
3171                return features & ~NETIF_F_GSO_MASK;
3172
3173        /* Support for GSO partial features requires software
3174         * intervention before we can actually process the packets,
3175         * so we need to strip support for any partial features now;
3176         * we can pull them back in after we have partially
3177         * segmented the frame.
3178         */
3179        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3180                features &= ~dev->gso_partial_features;
3181
3182        /* Make sure to clear the IPv4 ID mangling feature if the
3183         * IPv4 header has the potential to be fragmented.
3184         */
3185        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3186                struct iphdr *iph = skb->encapsulation ?
3187                                    inner_ip_hdr(skb) : ip_hdr(skb);
3188
3189                if (!(iph->frag_off & htons(IP_DF)))
3190                        features &= ~NETIF_F_TSO_MANGLEID;
3191        }
3192
3193        return features;
3194}
3195
3196netdev_features_t netif_skb_features(struct sk_buff *skb)
3197{
3198        struct net_device *dev = skb->dev;
3199        netdev_features_t features = dev->features;
3200
3201        if (skb_is_gso(skb))
3202                features = gso_features_check(skb, dev, features);
3203
3204        /* If encapsulation offload request, verify we are testing
3205         * hardware encapsulation features instead of standard
3206         * features for the netdev
3207         */
3208        if (skb->encapsulation)
3209                features &= dev->hw_enc_features;
3210
3211        if (skb_vlan_tagged(skb))
3212                features = netdev_intersect_features(features,
3213                                                     dev->vlan_features |
3214                                                     NETIF_F_HW_VLAN_CTAG_TX |
3215                                                     NETIF_F_HW_VLAN_STAG_TX);
3216
3217        if (dev->netdev_ops->ndo_features_check)
3218                features &= dev->netdev_ops->ndo_features_check(skb, dev,
3219                                                                features);
3220        else
3221                features &= dflt_features_check(skb, dev, features);
3222
3223        return harmonize_features(skb, features);
3224}
3225EXPORT_SYMBOL(netif_skb_features);
3226
3227static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3228                    struct netdev_queue *txq, bool more)
3229{
3230        unsigned int len;
3231        int rc;
3232
3233        if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3234                dev_queue_xmit_nit(skb, dev);
3235
3236        len = skb->len;
3237        trace_net_dev_start_xmit(skb, dev);
3238        rc = netdev_start_xmit(skb, dev, txq, more);
3239        trace_net_dev_xmit(skb, rc, dev, len);
3240
3241        return rc;
3242}
3243
3244struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3245                                    struct netdev_queue *txq, int *ret)
3246{
3247        struct sk_buff *skb = first;
3248        int rc = NETDEV_TX_OK;
3249
3250        while (skb) {
3251                struct sk_buff *next = skb->next;
3252
3253                skb->next = NULL;
3254                rc = xmit_one(skb, dev, txq, next != NULL);
3255                if (unlikely(!dev_xmit_complete(rc))) {
3256                        skb->next = next;
3257                        goto out;
3258                }
3259
3260                skb = next;
3261                if (netif_xmit_stopped(txq) && skb) {
3262                        rc = NETDEV_TX_BUSY;
3263                        break;
3264                }
3265        }
3266
3267out:
3268        *ret = rc;
3269        return skb;
3270}
3271
3272static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3273                                          netdev_features_t features)
3274{
3275        if (skb_vlan_tag_present(skb) &&
3276            !vlan_hw_offload_capable(features, skb->vlan_proto))
3277                skb = __vlan_hwaccel_push_inside(skb);
3278        return skb;
3279}
3280
3281int skb_csum_hwoffload_help(struct sk_buff *skb,
3282                            const netdev_features_t features)
3283{
3284        if (unlikely(skb->csum_not_inet))
3285                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3286                        skb_crc32c_csum_help(skb);
3287
3288        return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3289}
3290EXPORT_SYMBOL(skb_csum_hwoffload_help);
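
/* Illustrative sketch (hypothetical caller): a transmit path that has already
 * resolved the device features can use this helper to fall back to a software
 * checksum whenever the hardware cannot cover the packet:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    skb_csum_hwoffload_help(skb, features))
 *		goto drop;		(hypothetical error label)
 *
 * validate_xmit_skb() below does exactly this after positioning the transport
 * header at the checksum start offset.
 */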
3291
3292static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3293{
3294        netdev_features_t features;
3295
3296        features = netif_skb_features(skb);
3297        skb = validate_xmit_vlan(skb, features);
3298        if (unlikely(!skb))
3299                goto out_null;
3300
3301        skb = sk_validate_xmit_skb(skb, dev);
3302        if (unlikely(!skb))
3303                goto out_null;
3304
3305        if (netif_needs_gso(skb, features)) {
3306                struct sk_buff *segs;
3307
3308                segs = skb_gso_segment(skb, features);
3309                if (IS_ERR(segs)) {
3310                        goto out_kfree_skb;
3311                } else if (segs) {
3312                        consume_skb(skb);
3313                        skb = segs;
3314                }
3315        } else {
3316                if (skb_needs_linearize(skb, features) &&
3317                    __skb_linearize(skb))
3318                        goto out_kfree_skb;
3319
3320                /* If the packet is not checksummed and the device does not
3321                 * support checksumming for this protocol, complete the
3322                 * checksum here.
3323                 */
3324                if (skb->ip_summed == CHECKSUM_PARTIAL) {
3325                        if (skb->encapsulation)
3326                                skb_set_inner_transport_header(skb,
3327                                                               skb_checksum_start_offset(skb));
3328                        else
3329                                skb_set_transport_header(skb,
3330                                                         skb_checksum_start_offset(skb));
3331                        if (skb_csum_hwoffload_help(skb, features))
3332                                goto out_kfree_skb;
3333                }
3334        }
3335
3336        skb = validate_xmit_xfrm(skb, features, again);
3337
3338        return skb;
3339
3340out_kfree_skb:
3341        kfree_skb(skb);
3342out_null:
3343        atomic_long_inc(&dev->tx_dropped);
3344        return NULL;
3345}
3346
3347struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3348{
3349        struct sk_buff *next, *head = NULL, *tail;
3350
3351        for (; skb != NULL; skb = next) {
3352                next = skb->next;
3353                skb->next = NULL;
3354
3355                /* In case skb won't be segmented, point to itself */
3356                skb->prev = skb;
3357
3358                skb = validate_xmit_skb(skb, dev, again);
3359                if (!skb)
3360                        continue;
3361
3362                if (!head)
3363                        head = skb;
3364                else
3365                        tail->next = skb;
3366                /* If skb was segmented, skb->prev points to
3367                 * the last segment. If not, it still points to skb itself.
3368                 */
3369                tail = skb->prev;
3370        }
3371        return head;
3372}
3373EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3374
3375static void qdisc_pkt_len_init(struct sk_buff *skb)
3376{
3377        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3378
3379        qdisc_skb_cb(skb)->pkt_len = skb->len;
3380
3381        /* To get a more precise estimate of the bytes sent on the wire,
3382         * we add the header size of every segment to pkt_len.
3383         */
3384        if (shinfo->gso_size)  {
3385                unsigned int hdr_len;
3386                u16 gso_segs = shinfo->gso_segs;
3387
3388                /* mac layer + network layer */
3389                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3390
3391                /* + transport layer */
3392                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3393                        const struct tcphdr *th;
3394                        struct tcphdr _tcphdr;
3395
3396                        th = skb_header_pointer(skb, skb_transport_offset(skb),
3397                                                sizeof(_tcphdr), &_tcphdr);
3398                        if (likely(th))
3399                                hdr_len += __tcp_hdrlen(th);
3400                } else {
3401                        struct udphdr _udphdr;
3402
3403                        if (skb_header_pointer(skb, skb_transport_offset(skb),
3404                                               sizeof(_udphdr), &_udphdr))
3405                                hdr_len += sizeof(struct udphdr);
3406                }
3407
3408                if (shinfo->gso_type & SKB_GSO_DODGY)
3409                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3410                                                shinfo->gso_size);
3411
3412                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3413        }
3414}
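
/* Worked example with illustrative numbers: a TSO skb with skb->len = 4410,
 * gso_size = 1448 and gso_segs = 3 that carries 14 bytes of Ethernet,
 * 20 bytes of IPv4 and 32 bytes of TCP header (hdr_len = 66) will leave the
 * wire as three frames, each repeating those headers, so pkt_len is bumped
 * from 4410 to 4410 + (3 - 1) * 66 = 4542 bytes.
 */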
3415
3416static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3417                                 struct net_device *dev,
3418                                 struct netdev_queue *txq)
3419{
3420        spinlock_t *root_lock = qdisc_lock(q);
3421        struct sk_buff *to_free = NULL;
3422        bool contended;
3423        int rc;
3424
3425        qdisc_calculate_pkt_len(skb, q);
3426
3427        if (q->flags & TCQ_F_NOLOCK) {
3428                if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3429                        __qdisc_drop(skb, &to_free);
3430                        rc = NET_XMIT_DROP;
3431                } else {
3432                        rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3433                        qdisc_run(q);
3434                }
3435
3436                if (unlikely(to_free))
3437                        kfree_skb_list(to_free);
3438                return rc;
3439        }
3440
3441        /*
3442         * Heuristic to force contended enqueues to serialize on a
3443         * separate lock before trying to get the qdisc main lock.
3444         * This permits the qdisc->running owner to get the lock more
3445         * often and dequeue packets faster.
3446         */
3447        contended = qdisc_is_running(q);
3448        if (unlikely(contended))
3449                spin_lock(&q->busylock);
3450
3451        spin_lock(root_lock);
3452        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3453                __qdisc_drop(skb, &to_free);
3454                rc = NET_XMIT_DROP;
3455        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3456                   qdisc_run_begin(q)) {
3457                /*
3458                 * This is a work-conserving queue; there are no old skbs
3459                 * waiting to be sent out; and the qdisc is not running -
3460                 * xmit the skb directly.
3461                 */
3462
3463                qdisc_bstats_update(q, skb);
3464
3465                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3466                        if (unlikely(contended)) {
3467                                spin_unlock(&q->busylock);
3468                                contended = false;
3469                        }
3470                        __qdisc_run(q);
3471                }
3472
3473                qdisc_run_end(q);
3474                rc = NET_XMIT_SUCCESS;
3475        } else {
3476                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3477                if (qdisc_run_begin(q)) {
3478                        if (unlikely(contended)) {
3479                                spin_unlock(&q->busylock);
3480                                contended = false;
3481                        }
3482                        __qdisc_run(q);
3483                        qdisc_run_end(q);
3484                }
3485        }
3486        spin_unlock(root_lock);
3487        if (unlikely(to_free))
3488                kfree_skb_list(to_free);
3489        if (unlikely(contended))
3490                spin_unlock(&q->busylock);
3491        return rc;
3492}
3493
3494#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3495static void skb_update_prio(struct sk_buff *skb)
3496{
3497        const struct netprio_map *map;
3498        const struct sock *sk;
3499        unsigned int prioidx;
3500
3501        if (skb->priority)
3502                return;
3503        map = rcu_dereference_bh(skb->dev->priomap);
3504        if (!map)
3505                return;
3506        sk = skb_to_full_sk(skb);
3507        if (!sk)
3508                return;
3509
3510        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3511
3512        if (prioidx < map->priomap_len)
3513                skb->priority = map->priomap[prioidx];
3514}
3515#else
3516#define skb_update_prio(skb)
3517#endif
3518
3519DEFINE_PER_CPU(int, xmit_recursion);
3520EXPORT_SYMBOL(xmit_recursion);
3521
3522/**
3523 *      dev_loopback_xmit - loop back @skb
3524 *      @net: network namespace this loopback is happening in
3525 *      @sk:  sk needed so this can be used as a netfilter okfn
3526 *      @skb: buffer to transmit
3527 */
3528int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3529{
3530        skb_reset_mac_header(skb);
3531        __skb_pull(skb, skb_network_offset(skb));
3532        skb->pkt_type = PACKET_LOOPBACK;
3533        skb->ip_summed = CHECKSUM_UNNECESSARY;
3534        WARN_ON(!skb_dst(skb));
3535        skb_dst_force(skb);
3536        netif_rx_ni(skb);
3537        return 0;
3538}
3539EXPORT_SYMBOL(dev_loopback_xmit);
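
/* Illustrative sketch: because its signature matches a netfilter okfn,
 * dev_loopback_xmit() is typically invoked as the final step of an NF_HOOK()
 * call, roughly the way the IPv4 multicast output path loops a copy of the
 * frame back to the local stack:
 *
 *	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 *		net, sk, skb_copy, NULL, skb_copy->dev,
 *		dev_loopback_xmit);
 *
 * (skb_copy here is a hypothetical name for the cloned skb being looped.)
 */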
3540
3541#ifdef CONFIG_NET_EGRESS
3542static struct sk_buff *
3543sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3544{
3545        struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3546        struct tcf_result cl_res;
3547
3548        if (!miniq)
3549                return skb;
3550
3551        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3552        mini_qdisc_bstats_cpu_update(miniq, skb);
3553
3554        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3555        case TC_ACT_OK:
3556        case TC_ACT_RECLASSIFY:
3557                skb->tc_index = TC_H_MIN(cl_res.classid);
3558                break;
3559        case TC_ACT_SHOT:
3560                mini_qdisc_qstats_cpu_drop(miniq);
3561                *ret = NET_XMIT_DROP;
3562                kfree_skb(skb);
3563                return NULL;
3564        case TC_ACT_STOLEN:
3565        case TC_ACT_QUEUED:
3566        case TC_ACT_TRAP:
3567                *ret = NET_XMIT_SUCCESS;
3568                consume_skb(skb);
3569                return NULL;
3570        case TC_ACT_REDIRECT:
3571                /* No need to push/pop skb's mac_header here on egress! */
3572                skb_do_redirect(skb);
3573                *ret = NET_XMIT_SUCCESS;
3574                return NULL;
3575        default:
3576                break;
3577        }
3578
3579        return skb;
3580}
3581#endif /* CONFIG_NET_EGRESS */
3582
3583#ifdef CONFIG_XPS
3584static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3585                               struct xps_dev_maps *dev_maps, unsigned int tci)
3586{
3587        struct xps_map *map;
3588        int queue_index = -1;
3589
3590        if (dev->num_tc) {
3591                tci *= dev->num_tc;
3592                tci += netdev_get_prio_tc_map(dev, skb->priority);
3593        }
3594
3595        map = rcu_dereference(dev_maps->attr_map[tci]);
3596        if (map) {
3597                if (map->len == 1)
3598                        queue_index = map->queues[0];
3599                else
3600                        queue_index = map->queues[reciprocal_scale(
3601                                                skb_get_hash(skb), map->len)];
3602                if (unlikely(queue_index >= dev->real_num_tx_queues))
3603                        queue_index = -1;
3604        }
3605        return queue_index;
3606}
3607#endif
3608
3609static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3610                         struct sk_buff *skb)
3611{
3612#ifdef CONFIG_XPS
3613        struct xps_dev_maps *dev_maps;
3614        struct sock *sk = skb->sk;
3615        int queue_index = -1;
3616
3617        if (!static_key_false(&xps_needed))
3618                return -1;
3619
3620        rcu_read_lock();
3621        if (!static_key_false(&xps_rxqs_needed))
3622                goto get_cpus_map;
3623
3624        dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3625        if (dev_maps) {
3626                int tci = sk_rx_queue_get(sk);
3627
3628                if (tci >= 0 && tci < dev->num_rx_queues)
3629                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3630                                                          tci);
3631        }
3632
3633get_cpus_map:
3634        if (queue_index < 0) {
3635                dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3636                if (dev_maps) {
3637                        unsigned int tci = skb->sender_cpu - 1;
3638
3639                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3640                                                          tci);
3641                }
3642        }
3643        rcu_read_unlock();
3644
3645        return queue_index;
3646#else
3647        return -1;
3648#endif
3649}
3650
3651u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3652                     struct net_device *sb_dev,
3653                     select_queue_fallback_t fallback)
3654{
3655        return 0;
3656}
3657EXPORT_SYMBOL(dev_pick_tx_zero);
3658
3659u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3660                       struct net_device *sb_dev,
3661                       select_queue_fallback_t fallback)
3662{
3663        return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3664}
3665EXPORT_SYMBOL(dev_pick_tx_cpu_id);
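
/* Illustrative sketch (hypothetical driver): both helpers above match the
 * ndo_select_queue() signature, so a driver that only wants queue-zero or
 * per-CPU spreading behaviour can use them directly:
 *
 *	static const struct net_device_ops example_netdev_ops = {
 *		.ndo_start_xmit		= example_start_xmit,
 *		.ndo_select_queue	= dev_pick_tx_cpu_id,
 *	};
 */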
3666
3667static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3668                            struct net_device *sb_dev)
3669{
3670        struct sock *sk = skb->sk;
3671        int queue_index = sk_tx_queue_get(sk);
3672
3673        sb_dev = sb_dev ? : dev;
3674
3675        if (queue_index < 0 || skb->ooo_okay ||
3676            queue_index >= dev->real_num_tx_queues) {
3677                int new_index = get_xps_queue(dev, sb_dev, skb);
3678
3679                if (new_index < 0)
3680                        new_index = skb_tx_hash(dev, sb_dev, skb);
3681
3682                if (queue_index != new_index && sk &&
3683                    sk_fullsock(sk) &&
3684                    rcu_access_pointer(sk->sk_dst_cache))
3685                        sk_tx_queue_set(sk, new_index);
3686
3687                queue_index = new_index;
3688        }
3689
3690        return queue_index;
3691}
3692
3693struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3694                                    struct sk_buff *skb,
3695                                    struct net_device *sb_dev)
3696{
3697        int queue_index = 0;
3698
3699#ifdef CONFIG_XPS
3700        u32 sender_cpu = skb->sender_cpu - 1;
3701
3702        if (sender_cpu >= (u32)NR_CPUS)
3703                skb->sender_cpu = raw_smp_processor_id() + 1;
3704#endif
3705
3706        if (dev->real_num_tx_queues != 1) {
3707                const struct net_device_ops *ops = dev->netdev_ops;
3708
3709                if (ops->ndo_select_queue)
3710                        queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
3711                                                            __netdev_pick_tx);
3712                else
3713                        queue_index = __netdev_pick_tx(dev, skb, sb_dev);
3714
3715                queue_index = netdev_cap_txqueue(dev, queue_index);
3716        }
3717
3718        skb_set_queue_mapping(skb, queue_index);
3719        return netdev_get_tx_queue(dev, queue_index);
3720}
3721
3722/**
3723 *      __dev_queue_xmit - transmit a buffer
3724 *      @skb: buffer to transmit
3725 *      @sb_dev: subordinate device used for L2 forwarding offload
3726 *
3727 *      Queue a buffer for transmission to a network device. The caller must
3728 *      have set the device and priority and built the buffer before calling
3729 *      this function. The function can be called from an interrupt.
3730 *
3731 *      A negative errno code is returned on a failure. A success does not
3732 *      guarantee the frame will be transmitted as it may be dropped due
3733 *      to congestion or traffic shaping.
3734 *
3735 * -----------------------------------------------------------------------------------
3736 *      Note that this function can also return errors from the queue
3737 *      disciplines, including NET_XMIT_DROP, which is a positive value, so
3738 *      errors can be positive as well.
3739 *
3740 *      Regardless of the return value, the skb is consumed, so it is currently
3741 *      difficult to retry a failed send.  (You can bump the reference count
3742 *      before sending to hold a reference for a retry if you are careful.)
3743 *
3744 *      When calling this method, interrupts MUST be enabled.  This is because
3745 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3746 *          --BLG
3747 */
3748static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3749{
3750        struct net_device *dev = skb->dev;
3751        struct netdev_queue *txq;
3752        struct Qdisc *q;
3753        int rc = -ENOMEM;
3754        bool again = false;
3755
3756        skb_reset_mac_header(skb);
3757
3758        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3759                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3760
3761        /* Disable soft irqs for various locks below. Also
3762         * stops preemption for RCU.
3763         */
3764        rcu_read_lock_bh();
3765
3766        skb_update_prio(skb);
3767
3768        qdisc_pkt_len_init(skb);
3769#ifdef CONFIG_NET_CLS_ACT
3770        skb->tc_at_ingress = 0;
3771# ifdef CONFIG_NET_EGRESS
3772        if (static_branch_unlikely(&egress_needed_key)) {
3773                skb = sch_handle_egress(skb, &rc, dev);
3774                if (!skb)
3775                        goto out;
3776        }
3777# endif
3778#endif
3779        /* If device/qdisc don't need skb->dst, release it right now while
3780         * it's hot in this CPU's cache.
3781         */
3782        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3783                skb_dst_drop(skb);
3784        else
3785                skb_dst_force(skb);
3786
3787        txq = netdev_pick_tx(dev, skb, sb_dev);
3788        q = rcu_dereference_bh(txq->qdisc);
3789
3790        trace_net_dev_queue(skb);
3791        if (q->enqueue) {
3792                rc = __dev_xmit_skb(skb, q, dev, txq);
3793                goto out;
3794        }
3795
3796        /* The device has no queue. Common case for software devices:
3797         * loopback, all the sorts of tunnels...
3798         *
3799         * Really, it is unlikely that netif_tx_lock protection is necessary
3800         * here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3801         * counters.)
3802         * However, it is possible that they rely on the protection
3803         * made by us here.
3804         *
3805         * Check this and take the lock; it is not prone to deadlocks.
3806         * Either way, the noqueue qdisc path is even simpler 8)
3807         */
3808        if (dev->flags & IFF_UP) {
3809                int cpu = smp_processor_id(); /* ok because BHs are off */
3810
3811                if (txq->xmit_lock_owner != cpu) {
3812                        if (unlikely(__this_cpu_read(xmit_recursion) >
3813                                     XMIT_RECURSION_LIMIT))
3814                                goto recursion_alert;
3815
3816                        skb = validate_xmit_skb(skb, dev, &again);
3817                        if (!skb)
3818                                goto out;
3819
3820                        HARD_TX_LOCK(dev, txq, cpu);
3821
3822                        if (!netif_xmit_stopped(txq)) {
3823                                __this_cpu_inc(xmit_recursion);
3824                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3825                                __this_cpu_dec(xmit_recursion);
3826                                if (dev_xmit_complete(rc)) {
3827                                        HARD_TX_UNLOCK(dev, txq);
3828                                        goto out;
3829                                }
3830                        }
3831                        HARD_TX_UNLOCK(dev, txq);
3832                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3833                                             dev->name);
3834                } else {
3835                        /* Recursion is detected! It is possible,
3836                         * unfortunately
3837                         */
3838recursion_alert:
3839                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3840                                             dev->name);
3841                }
3842        }
3843
3844        rc = -ENETDOWN;
3845        rcu_read_unlock_bh();
3846
3847        atomic_long_inc(&dev->tx_dropped);
3848        kfree_skb_list(skb);
3849        return rc;
3850out:
3851        rcu_read_unlock_bh();
3852        return rc;
3853}
3854
3855int dev_queue_xmit(struct sk_buff *skb)
3856{
3857        return __dev_queue_xmit(skb, NULL);
3858}
3859EXPORT_SYMBOL(dev_queue_xmit);
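
/* Illustrative sketch (hypothetical caller): a protocol layer hands a fully
 * built skb to the stack roughly like this.  The skb is always consumed, and
 * the qdisc may return positive NET_XMIT_* codes, which callers commonly map
 * through net_xmit_errno():
 *
 *	skb->dev = dev;
 *	skb->priority = sk->sk_priority;
 *	err = dev_queue_xmit(skb);
 *	if (err > 0)
 *		err = net_xmit_errno(err);
 */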
3860
3861int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
3862{
3863        return __dev_queue_xmit(skb, sb_dev);
3864}
3865EXPORT_SYMBOL(dev_queue_xmit_accel);
3866
3867int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
3868{
3869        struct net_device *dev = skb->dev;
3870        struct sk_buff *orig_skb = skb;
3871        struct netdev_queue *txq;
3872        int ret = NETDEV_TX_BUSY;
3873        bool again = false;
3874
3875        if (unlikely(!netif_running(dev) ||
3876                     !netif_carrier_ok(dev)))
3877                goto drop;
3878
3879        skb = validate_xmit_skb_list(skb, dev, &again);
3880        if (skb != orig_skb)
3881                goto drop;
3882
3883        skb_set_queue_mapping(skb, queue_id);
3884        txq = skb_get_tx_queue(dev, skb);
3885
3886        local_bh_disable();
3887
3888        HARD_TX_LOCK(dev, txq, smp_processor_id());
3889        if (!netif_xmit_frozen_or_drv_stopped(txq))
3890                ret = netdev_start_xmit(skb, dev, txq, false);
3891        HARD_TX_UNLOCK(dev, txq);
3892
3893        local_bh_enable();
3894
3895        if (!dev_xmit_complete(ret))
3896                kfree_skb(skb);
3897
3898        return ret;
3899drop:
3900        atomic_long_inc(&dev->tx_dropped);
3901        kfree_skb_list(skb);
3902        return NET_XMIT_DROP;
3903}
3904EXPORT_SYMBOL(dev_direct_xmit);
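
/* Illustrative sketch (hypothetical caller): queue-bypassing senders, for
 * example AF_XDP sockets, choose a queue id themselves and transmit without
 * going through a qdisc:
 *
 *	skb->dev = dev;
 *	err = dev_direct_xmit(skb, queue_id);
 *
 * As above, the skb is consumed on every path; NET_XMIT_DROP comes back if
 * the device is not running or has no carrier.
 */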
3905
3906/*************************************************************************
3907 *                      Receiver routines
3908 *************************************************************************/
3909
3910int netdev_max_backlog __read_mostly = 1000;
3911EXPORT_SYMBOL(netdev_max_backlog);
3912
3913int netdev_tstamp_prequeue __read_mostly = 1;
3914int netdev_budget __read_mostly = 300;
3915unsigned int __read_mostly netdev_budget_usecs = 2000;
3916int weight_p __read_mostly = 64;           /* old backlog weight */
3917int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3918int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3919int dev_rx_weight __read_mostly = 64;
3920int dev_tx_weight __read_mostly = 64;
3921
3922/* Called with irq disabled */
3923static inline void ____napi_schedule(struct softnet_data *sd,
3924                                     struct napi_struct *napi)
3925{
3926        list_add_tail(&napi->poll_list, &sd->poll_list);
3927        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3928}
3929
3930#ifdef CONFIG_RPS
3931
3932/* One global table that all flow-based protocols share. */
3933struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3934EXPORT_SYMBOL(rps_sock_flow_table);
3935u32 rps_cpu_mask __read_mostly;
3936EXPORT_SYMBOL(rps_cpu_mask);
3937
3938struct static_key rps_needed __read_mostly;
3939EXPORT_SYMBOL(rps_needed);
3940struct static_key rfs_needed __read_mostly;
3941EXPORT_SYMBOL(rfs_needed);
3942
3943static struct rps_dev_flow *
3944set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3945            struct rps_dev_flow *rflow, u16 next_cpu)
3946{
3947        if (next_cpu < nr_cpu_ids) {
3948#ifdef CONFIG_RFS_ACCEL
3949                struct netdev_rx_queue *rxqueue;
3950                struct rps_dev_flow_table *flow_table;
3951                struct rps_dev_flow *old_rflow;
3952                u32 flow_id;
3953                u16 rxq_index;
3954                int rc;
3955
3956                /* Should we steer this flow to a different hardware queue? */
3957                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3958                    !(dev->features & NETIF_F_NTUPLE))
3959                        goto out;
3960                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3961                if (rxq_index == skb_get_rx_queue(skb))
3962                        goto out;
3963
3964                rxqueue = dev->_rx + rxq_index;
3965                flow_table = rcu_dereference(rxqueue->rps_flow_table);
3966                if (!flow_table)
3967                        goto out;
3968                flow_id = skb_get_hash(skb) & flow_table->mask;
3969                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3970                                                        rxq_index, flow_id);
3971                if (rc < 0)
3972                        goto out;
3973                old_rflow = rflow;
3974                rflow = &flow_table->flows[flow_id];
3975                rflow->filter = rc;
3976                if (old_rflow->filter == rflow->filter)
3977                        old_rflow->filter = RPS_NO_FILTER;
3978        out:
3979#endif
3980                rflow->last_qtail =
3981                        per_cpu(softnet_data, next_cpu).input_queue_head;
3982        }
3983
3984        rflow->cpu = next_cpu;
3985        return rflow;
3986}
3987
3988/*
3989 * get_rps_cpu is called from netif_receive_skb and returns the target
3990 * CPU from the RPS map of the receiving queue for a given skb.
3991 * rcu_read_lock must be held on entry.
3992 */
3993static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3994                       struct rps_dev_flow **rflowp)
3995{
3996        const struct rps_sock_flow_table *sock_flow_table;
3997        struct netdev_rx_queue *rxqueue = dev->_rx;
3998        struct rps_dev_flow_table *flow_table;
3999        struct rps_map *map;
4000        int cpu = -1;
4001        u32 tcpu;
4002        u32 hash;
4003
4004        if (skb_rx_queue_recorded(skb)) {
4005                u16 index = skb_get_rx_queue(skb);
4006
4007                if (unlikely(index >= dev->real_num_rx_queues)) {
4008                        WARN_ONCE(dev->real_num_rx_queues > 1,
4009                                  "%s received packet on queue %u, but number "
4010                                  "of RX queues is %u\n",
4011                                  dev->name, index, dev->real_num_rx_queues);
4012                        goto done;
4013                }
4014                rxqueue += index;
4015        }
4016
4017        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4018
4019        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4020        map = rcu_dereference(rxqueue->rps_map);
4021        if (!flow_table && !map)
4022                goto done;
4023
4024        skb_reset_network_header(skb);
4025        hash = skb_get_hash(skb);
4026        if (!hash)
4027                goto done;
4028
4029        sock_flow_table = rcu_dereference(rps_sock_flow_table);
4030        if (flow_table && sock_flow_table) {
4031                struct rps_dev_flow *rflow;
4032                u32 next_cpu;
4033                u32 ident;
4034
4035                /* First check into global flow table if there is a match */
4036                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4037                if ((ident ^ hash) & ~rps_cpu_mask)
4038                        goto try_rps;
4039
4040                next_cpu = ident & rps_cpu_mask;
4041
4042                /* OK, now we know there is a match;
4043                 * we can look at the local (per receive queue) flow table.
4044                 */
4045                rflow = &flow_table->flows[hash & flow_table->mask];
4046                tcpu = rflow->cpu;
4047
4048                /*
4049                 * If the desired CPU (where last recvmsg was done) is
4050                 * different from current CPU (one in the rx-queue flow
4051                 * table entry), switch if one of the following holds:
4052                 *   - Current CPU is unset (>= nr_cpu_ids).
4053                 *   - Current CPU is offline.
4054                 *   - The current CPU's queue tail has advanced beyond the
4055                 *     last packet that was enqueued using this table entry.
4056                 *     This guarantees that all previous packets for the flow
4057                 *     have been dequeued, thus preserving in order delivery.
4058                 */
4059                if (unlikely(tcpu != next_cpu) &&
4060                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4061                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4062                      rflow->last_qtail)) >= 0)) {
4063                        tcpu = next_cpu;
4064                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4065                }
4066
4067                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4068                        *rflowp = rflow;
4069                        cpu = tcpu;
4070                        goto done;
4071                }
4072        }
4073
4074try_rps:
4075
4076        if (map) {
4077                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4078                if (cpu_online(tcpu)) {
4079                        cpu = tcpu;
4080                        goto done;
4081                }
4082        }
4083
4084done:
4085        return cpu;
4086}
4087
4088#ifdef CONFIG_RFS_ACCEL
4089
4090/**
4091 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4092 * @dev: Device on which the filter was set
4093 * @rxq_index: RX queue index
4094 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4095 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4096 *
4097 * Drivers that implement ndo_rx_flow_steer() should periodically call
4098 * this function for each installed filter and remove the filters for
4099 * which it returns %true.
4100 */
4101bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4102                         u32 flow_id, u16 filter_id)
4103{
4104        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4105        struct rps_dev_flow_table *flow_table;
4106        struct rps_dev_flow *rflow;
4107        bool expire = true;
4108        unsigned int cpu;
4109
4110        rcu_read_lock();
4111        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4112        if (flow_table && flow_id <= flow_table->mask) {
4113                rflow = &flow_table->flows[flow_id];
4114                cpu = READ_ONCE(rflow->cpu);
4115                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4116                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4117                           rflow->last_qtail) <
4118                     (int)(10 * flow_table->mask)))
4119                        expire = false;
4120        }
4121        rcu_read_unlock();
4122        return expire;
4123}
4124EXPORT_SYMBOL(rps_may_expire_flow);
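
/* Illustrative sketch (hypothetical driver state): a driver implementing
 * ndo_rx_flow_steer() would keep track of the filters it installed and
 * periodically let the stack decide which ones may be torn down:
 *
 *	for (i = 0; i < priv->n_flow_filters; i++) {
 *		struct example_filter *f = &priv->flow_filters[i];
 *
 *		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, f->filter_id))
 *			example_remove_hw_filter(priv, f);
 *	}
 *
 * All of the priv/filter names above are hypothetical.
 */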
4125
4126#endif /* CONFIG_RFS_ACCEL */
4127
4128/* Called from hardirq (IPI) context */
4129static void rps_trigger_softirq(void *data)
4130{
4131        struct softnet_data *sd = data;
4132
4133        ____napi_schedule(sd, &sd->backlog);
4134        sd->received_rps++;
4135}
4136
4137#endif /* CONFIG_RPS */
4138
4139/*
4140 * Check if this softnet_data structure belongs to another CPU.
4141 * If yes, queue it to our IPI list and return 1.
4142 * If no, return 0.
4143 */
4144static int rps_ipi_queued(struct softnet_data *sd)
4145{
4146#ifdef CONFIG_RPS
4147        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4148
4149        if (sd != mysd) {
4150                sd->rps_ipi_next = mysd->rps_ipi_list;
4151                mysd->rps_ipi_list = sd;
4152
4153                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4154                return 1;
4155        }
4156#endif /* CONFIG_RPS */
4157        return 0;
4158}
4159
4160#ifdef CONFIG_NET_FLOW_LIMIT
4161int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4162#endif
4163
4164static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4165{
4166#ifdef CONFIG_NET_FLOW_LIMIT
4167        struct sd_flow_limit *fl;
4168        struct softnet_data *sd;
4169        unsigned int old_flow, new_flow;
4170
4171        if (qlen < (netdev_max_backlog >> 1))
4172                return false;
4173
4174        sd = this_cpu_ptr(&softnet_data);
4175
4176        rcu_read_lock();
4177        fl = rcu_dereference(sd->flow_limit);
4178        if (fl) {
4179                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4180                old_flow = fl->history[fl->history_head];
4181                fl->history[fl->history_head] = new_flow;
4182
4183                fl->history_head++;
4184                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4185
4186                if (likely(fl->buckets[old_flow]))
4187                        fl->buckets[old_flow]--;
4188
4189                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4190                        fl->count++;
4191                        rcu_read_unlock();
4192                        return true;
4193                }
4194        }
4195        rcu_read_unlock();
4196#endif
4197        return false;
4198}
4199
4200/*
4201 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4202 * queue (may be a remote CPU queue).
4203 */
4204static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4205                              unsigned int *qtail)
4206{
4207        struct softnet_data *sd;
4208        unsigned long flags;
4209        unsigned int qlen;
4210
4211        sd = &per_cpu(softnet_data, cpu);
4212
4213        local_irq_save(flags);
4214
4215        rps_lock(sd);
4216        if (!netif_running(skb->dev))
4217                goto drop;
4218        qlen = skb_queue_len(&sd->input_pkt_queue);
4219        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4220                if (qlen) {
4221enqueue:
4222                        __skb_queue_tail(&sd->input_pkt_queue, skb);
4223                        input_queue_tail_incr_save(sd, qtail);
4224                        rps_unlock(sd);
4225                        local_irq_restore(flags);
4226                        return NET_RX_SUCCESS;
4227                }
4228
4229                /* Schedule NAPI for the backlog device.
4230                 * We can use a non-atomic operation since we own the queue lock.
4231                 */
4232                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4233                        if (!rps_ipi_queued(sd))
4234                                ____napi_schedule(sd, &sd->backlog);
4235                }
4236                goto enqueue;
4237        }
4238
4239drop:
4240        sd->dropped++;
4241        rps_unlock(sd);
4242
4243        local_irq_restore(flags);
4244
4245        atomic_long_inc(&skb->dev->rx_dropped);
4246        kfree_skb(skb);
4247        return NET_RX_DROP;
4248}
4249
4250static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4251{
4252        struct net_device *dev = skb->dev;
4253        struct netdev_rx_queue *rxqueue;
4254
4255        rxqueue = dev->_rx;
4256
4257        if (skb_rx_queue_recorded(skb)) {
4258                u16 index = skb_get_rx_queue(skb);
4259
4260                if (unlikely(index >= dev->real_num_rx_queues)) {
4261                        WARN_ONCE(dev->real_num_rx_queues > 1,
4262                                  "%s received packet on queue %u, but number "
4263                                  "of RX queues is %u\n",
4264                                  dev->name, index, dev->real_num_rx_queues);
4265
4266                        return rxqueue; /* Return first rxqueue */
4267                }
4268                rxqueue += index;
4269        }
4270        return rxqueue;
4271}
4272
4273static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4274                                     struct xdp_buff *xdp,
4275                                     struct bpf_prog *xdp_prog)
4276{
4277        struct netdev_rx_queue *rxqueue;
4278        void *orig_data, *orig_data_end;
4279        u32 metalen, act = XDP_DROP;
4280        int hlen, off;
4281        u32 mac_len;
4282
4283        /* Reinjected packets coming from act_mirred or similar should
4284         * not get XDP generic processing.
4285         */
4286        if (skb_cloned(skb) || skb_is_tc_redirected(skb))
4287                return XDP_PASS;
4288
4289        /* XDP packets must be linear and must have sufficient headroom
4290         * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
4291         * XDP also provides, thus we need to do it here as well.
4292         */
4293        if (skb_is_nonlinear(skb) ||
4294            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4295                int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4296                int troom = skb->tail + skb->data_len - skb->end;
4297
4298                /* In case we have to go down this path and also linearize,
4299                 * let's do the pskb_expand_head() work just once here.
4300                 */
4301                if (pskb_expand_head(skb,
4302                                     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4303                                     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4304                        goto do_drop;
4305                if (skb_linearize(skb))
4306                        goto do_drop;
4307        }
4308
4309        /* The XDP program wants to see the packet starting at the MAC
4310         * header.
4311         */
4312        mac_len = skb->data - skb_mac_header(skb);
4313        hlen = skb_headlen(skb) + mac_len;
4314        xdp->data = skb->data - mac_len;
4315        xdp->data_meta = xdp->data;
4316        xdp->data_end = xdp->data + hlen;
4317        xdp->data_hard_start = skb->data - skb_headroom(skb);
4318        orig_data_end = xdp->data_end;
4319        orig_data = xdp->data;
4320
4321        rxqueue = netif_get_rxqueue(skb);
4322        xdp->rxq = &rxqueue->xdp_rxq;
4323
4324        act = bpf_prog_run_xdp(xdp_prog, xdp);
4325
4326        off = xdp->data - orig_data;
4327        if (off > 0)
4328                __skb_pull(skb, off);
4329        else if (off < 0)
4330                __skb_push(skb, -off);
4331        skb->mac_header += off;
4332
4333        /* Check if bpf_xdp_adjust_tail() was used; it can only "shrink"
4334         * the packet.
4335         */
4336        off = orig_data_end - xdp->data_end;
4337        if (off != 0) {
4338                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4339                skb->len -= off;
4340
4341        }
4342
4343        switch (act) {
4344        case XDP_REDIRECT:
4345        case XDP_TX:
4346                __skb_push(skb, mac_len);
4347                break;
4348        case XDP_PASS:
4349                metalen = xdp->data - xdp->data_meta;
4350                if (metalen)
4351                        skb_metadata_set(skb, metalen);
4352                break;
4353        default:
4354                bpf_warn_invalid_xdp_action(act);
4355                /* fall through */
4356        case XDP_ABORTED:
4357                trace_xdp_exception(skb->dev, xdp_prog, act);
4358                /* fall through */
4359        case XDP_DROP:
4360        do_drop:
4361                kfree_skb(skb);
4362                break;
4363        }
4364
4365        return act;
4366}
4367
4368/* When doing generic XDP we have to bypass the qdisc layer and the
4369 * network taps in order to match in-driver-XDP behavior.
4370 */
4371void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4372{
4373        struct net_device *dev = skb->dev;
4374        struct netdev_queue *txq;
4375        bool free_skb = true;
4376        int cpu, rc;
4377
4378        txq = netdev_pick_tx(dev, skb, NULL);
4379        cpu = smp_processor_id();
4380        HARD_TX_LOCK(dev, txq, cpu);
4381        if (!netif_xmit_stopped(txq)) {
4382                rc = netdev_start_xmit(skb, dev, txq, 0);
4383                if (dev_xmit_complete(rc))
4384                        free_skb = false;
4385        }
4386        HARD_TX_UNLOCK(dev, txq);
4387        if (free_skb) {
4388                trace_xdp_exception(dev, xdp_prog, XDP_TX);
4389                kfree_skb(skb);
4390        }
4391}
4392EXPORT_SYMBOL_GPL(generic_xdp_tx);
4393
4394static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4395
4396int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4397{
4398        if (xdp_prog) {
4399                struct xdp_buff xdp;
4400                u32 act;
4401                int err;
4402
4403                act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4404                if (act != XDP_PASS) {
4405                        switch (act) {
4406                        case XDP_REDIRECT:
4407                                err = xdp_do_generic_redirect(skb->dev, skb,
4408                                                              &xdp, xdp_prog);
4409                                if (err)
4410                                        goto out_redir;
4411                                break;
4412                        case XDP_TX:
4413                                generic_xdp_tx(skb, xdp_prog);
4414                                break;
4415                        }
4416                        return XDP_DROP;
4417                }
4418        }
4419        return XDP_PASS;
4420out_redir:
4421        kfree_skb(skb);
4422        return XDP_DROP;
4423}
4424EXPORT_SYMBOL_GPL(do_xdp_generic);
4425
4426static int netif_rx_internal(struct sk_buff *skb)
4427{
4428        int ret;
4429
4430        net_timestamp_check(netdev_tstamp_prequeue, skb);
4431
4432        trace_netif_rx(skb);
4433
4434        if (static_branch_unlikely(&generic_xdp_needed_key)) {
4435                int ret;
4436
4437                preempt_disable();
4438                rcu_read_lock();
4439                ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4440                rcu_read_unlock();
4441                preempt_enable();
4442
4443                /* Consider XDP consuming the packet a success from
4444                 * the netdev point of view; we do not want to count
4445                 * this as an error.
4446                 */
4447                if (ret != XDP_PASS)
4448                        return NET_RX_SUCCESS;
4449        }
4450
4451#ifdef CONFIG_RPS
4452        if (static_key_false(&rps_needed)) {
4453                struct rps_dev_flow voidflow, *rflow = &voidflow;
4454                int cpu;
4455
4456                preempt_disable();
4457                rcu_read_lock();
4458
4459                cpu = get_rps_cpu(skb->dev, skb, &rflow);
4460                if (cpu < 0)
4461                        cpu = smp_processor_id();
4462
4463                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4464
4465                rcu_read_unlock();
4466                preempt_enable();
4467        } else
4468#endif
4469        {
4470                unsigned int qtail;
4471
4472                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4473                put_cpu();
4474        }
4475        return ret;
4476}
4477
4478/**
4479 *      netif_rx        -       post buffer to the network code
4480 *      @skb: buffer to post
4481 *
4482 *      This function receives a packet from a device driver and queues it for
4483 *      the upper (protocol) levels to process.  It always succeeds. The buffer
4484 *      may be dropped during processing for congestion control or by the
4485 *      protocol layers.
4486 *
4487 *      return values:
4488 *      NET_RX_SUCCESS  (no congestion)
4489 *      NET_RX_DROP     (packet was dropped)
4490 *
4491 */
4492
4493int netif_rx(struct sk_buff *skb)
4494{
4495        trace_netif_rx_entry(skb);
4496
4497        return netif_rx_internal(skb);
4498}
4499EXPORT_SYMBOL(netif_rx);
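
/* Illustrative sketch (hypothetical driver): a non-NAPI driver typically
 * builds the skb in its receive interrupt and posts it like this:
 *
 *	skb = netdev_alloc_skb(dev, pkt_len);
 *	if (!skb)
 *		return;			(driver accounts an rx drop)
 *	skb_put_data(skb, rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * Callers running in process context should use netif_rx_ni() below instead.
 */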
4500
4501int netif_rx_ni(struct sk_buff *skb)
4502{
4503        int err;
4504
4505        trace_netif_rx_ni_entry(skb);
4506
4507        preempt_disable();
4508        err = netif_rx_internal(skb);
4509        if (local_softirq_pending())
4510                do_softirq();
4511        preempt_enable();
4512
4513        return err;
4514}
4515EXPORT_SYMBOL(netif_rx_ni);
4516
4517static __latent_entropy void net_tx_action(struct softirq_action *h)
4518{
4519        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4520
4521        if (sd->completion_queue) {
4522                struct sk_buff *clist;
4523
4524                local_irq_disable();
4525                clist = sd->completion_queue;
4526                sd->completion_queue = NULL;
4527                local_irq_enable();
4528
4529                while (clist) {
4530                        struct sk_buff *skb = clist;
4531
4532                        clist = clist->next;
4533
4534                        WARN_ON(refcount_read(&skb->users));
4535                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4536                                trace_consume_skb(skb);
4537                        else
4538                                trace_kfree_skb(skb, net_tx_action);
4539
4540                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4541                                __kfree_skb(skb);
4542                        else
4543                                __kfree_skb_defer(skb);
4544                }
4545
4546                __kfree_skb_flush();
4547        }
4548
4549        if (sd->output_queue) {
4550                struct Qdisc *head;
4551
4552                local_irq_disable();
4553                head = sd->output_queue;
4554                sd->output_queue = NULL;
4555                sd->output_queue_tailp = &sd->output_queue;
4556                local_irq_enable();
4557
4558                while (head) {
4559                        struct Qdisc *q = head;
4560                        spinlock_t *root_lock = NULL;
4561
4562                        head = head->next_sched;
4563
4564                        if (!(q->flags & TCQ_F_NOLOCK)) {
4565                                root_lock = qdisc_lock(q);
4566                                spin_lock(root_lock);
4567                        }
4568                        /* We need to make sure head->next_sched is read
4569                         * before clearing __QDISC_STATE_SCHED
4570                         */
4571                        smp_mb__before_atomic();
4572                        clear_bit(__QDISC_STATE_SCHED, &q->state);
4573                        qdisc_run(q);
4574                        if (root_lock)
4575                                spin_unlock(root_lock);
4576                }
4577        }
4578
4579        xfrm_dev_backlog(sd);
4580}
4581
4582#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4583/* This hook is defined here for ATM LANE */
4584int (*br_fdb_test_addr_hook)(struct net_device *dev,
4585                             unsigned char *addr) __read_mostly;
4586EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4587#endif
4588
4589static inline struct sk_buff *
4590sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4591                   struct net_device *orig_dev)
4592{
4593#ifdef CONFIG_NET_CLS_ACT
4594        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4595        struct tcf_result cl_res;
4596
4597        /* If there's at least one ingress qdisc present somewhere (so
4598         * we get here via the enabled static key), remaining devices
4599         * that are not configured with an ingress qdisc will bail
4600         * out here.
4601         */
4602        if (!miniq)
4603                return skb;
4604
4605        if (*pt_prev) {
4606                *ret = deliver_skb(skb, *pt_prev, orig_dev);
4607                *pt_prev = NULL;
4608        }
4609
4610        qdisc_skb_cb(skb)->pkt_len = skb->len;
4611        skb->tc_at_ingress = 1;
4612        mini_qdisc_bstats_cpu_update(miniq, skb);
4613
4614        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
4615        case TC_ACT_OK:
4616        case TC_ACT_RECLASSIFY:
4617                skb->tc_index = TC_H_MIN(cl_res.classid);
4618                break;
4619        case TC_ACT_SHOT:
4620                mini_qdisc_qstats_cpu_drop(miniq);
4621                kfree_skb(skb);
4622                return NULL;
4623        case TC_ACT_STOLEN:
4624        case TC_ACT_QUEUED:
4625        case TC_ACT_TRAP:
4626                consume_skb(skb);
4627                return NULL;
4628        case TC_ACT_REDIRECT:
4629                /* skb_mac_header check was done by cls/act_bpf, so
4630                 * we can safely push the L2 header back before
4631                 * redirecting to another netdev
4632                 */
4633                __skb_push(skb, skb->mac_len);
4634                skb_do_redirect(skb);
4635                return NULL;
4636        case TC_ACT_REINSERT:
4637                /* This does not scrub the packet; stats are updated on error. */
4638                skb_tc_reinsert(skb, &cl_res);
4639                return NULL;
4640        default:
4641                break;
4642        }
4643#endif /* CONFIG_NET_CLS_ACT */
4644        return skb;
4645}
4646
4647/**
4648 *      netdev_is_rx_handler_busy - check if receive handler is registered
4649 *      @dev: device to check
4650 *
4651 *      Check if a receive handler is already registered for a given device.
4652 *      Return true if there is one.
4653 *
4654 *      The caller must hold the rtnl_mutex.
4655 */
4656bool netdev_is_rx_handler_busy(struct net_device *dev)
4657{
4658        ASSERT_RTNL();
4659        return dev && rtnl_dereference(dev->rx_handler);
4660}
4661EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4662
4663/**
4664 *      netdev_rx_handler_register - register receive handler
4665 *      @dev: device to register a handler for
4666 *      @rx_handler: receive handler to register
4667 *      @rx_handler_data: data pointer that is used by rx handler
4668 *
4669 *      Register a receive handler for a device. This handler will then be
4670 *      called from __netif_receive_skb. A negative errno code is returned
4671 *      on a failure.
4672 *
4673 *      The caller must hold the rtnl_mutex.
4674 *
4675 *      For a general description of rx_handler, see enum rx_handler_result.
4676 */
4677int netdev_rx_handler_register(struct net_device *dev,
4678                               rx_handler_func_t *rx_handler,
4679                               void *rx_handler_data)
4680{
4681        if (netdev_is_rx_handler_busy(dev))
4682                return -EBUSY;
4683
4684        if (dev->priv_flags & IFF_NO_RX_HANDLER)
4685                return -EINVAL;
4686
4687        /* Note: rx_handler_data must be set before rx_handler */
4688        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4689        rcu_assign_pointer(dev->rx_handler, rx_handler);
4690
4691        return 0;
4692}
4693EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
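
/* Illustrative sketch (not part of the original file): how a hypothetical
 * stacked device could claim frames from a lower device with an rx_handler.
 * All example_* names and struct example_port are assumptions made only for
 * this sketch; real users are bridge, bonding, team, macvlan and friends.
 */
struct example_port {
        struct net_device *upper;       /* hypothetical upper device */
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

        /* Hand the frame to the upper device; RX_HANDLER_ANOTHER makes
         * __netif_receive_skb_core() run another round on the new skb->dev.
         */
        skb->dev = port->upper;
        return RX_HANDLER_ANOTHER;
}

static int example_attach(struct net_device *lower, struct example_port *port)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(lower, example_handle_frame, port);
        rtnl_unlock();
        return err;
}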
4694
4695/**
4696 *      netdev_rx_handler_unregister - unregister receive handler
4697 *      @dev: device to unregister a handler from
4698 *
4699 *      Unregister a receive handler from a device.
4700 *
4701 *      The caller must hold the rtnl_mutex.
4702 */
4703void netdev_rx_handler_unregister(struct net_device *dev)
4704{
4705
4706        ASSERT_RTNL();
4707        RCU_INIT_POINTER(dev->rx_handler, NULL);
4708        /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4709         * section is guaranteed to also see a non-NULL
4710         * rx_handler_data.
4711         */
4712        synchronize_net();
4713        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4714}
4715EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4716
4717/*
4718 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4719 * the special handling of PFMEMALLOC skbs.
4720 */
4721static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4722{
4723        switch (skb->protocol) {
4724        case htons(ETH_P_ARP):
4725        case htons(ETH_P_IP):
4726        case htons(ETH_P_IPV6):
4727        case htons(ETH_P_8021Q):
4728        case htons(ETH_P_8021AD):
4729                return true;
4730        default:
4731                return false;
4732        }
4733}
4734
4735static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4736                             int *ret, struct net_device *orig_dev)
4737{
4738#ifdef CONFIG_NETFILTER_INGRESS
4739        if (nf_hook_ingress_active(skb)) {
4740                int ingress_retval;
4741
4742                if (*pt_prev) {
4743                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
4744                        *pt_prev = NULL;
4745                }
4746
4747                rcu_read_lock();
4748                ingress_retval = nf_hook_ingress(skb);
4749                rcu_read_unlock();
4750                return ingress_retval;
4751        }
4752#endif /* CONFIG_NETFILTER_INGRESS */
4753        return 0;
4754}
4755
4756static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
4757                                    struct packet_type **ppt_prev)
4758{
4759        struct packet_type *ptype, *pt_prev;
4760        rx_handler_func_t *rx_handler;
4761        struct net_device *orig_dev;
4762        bool deliver_exact = false;
4763        int ret = NET_RX_DROP;
4764        __be16 type;
4765
4766        net_timestamp_check(!netdev_tstamp_prequeue, skb);
4767
4768        trace_netif_receive_skb(skb);
4769
4770        orig_dev = skb->dev;
4771
4772        skb_reset_network_header(skb);
4773        if (!skb_transport_header_was_set(skb))
4774                skb_reset_transport_header(skb);
4775        skb_reset_mac_len(skb);
4776
4777        pt_prev = NULL;
4778
4779another_round:
4780        skb->skb_iif = skb->dev->ifindex;
4781
4782        __this_cpu_inc(softnet_data.processed);
4783
4784        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4785            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4786                skb = skb_vlan_untag(skb);
4787                if (unlikely(!skb))
4788                        goto out;
4789        }
4790
4791        if (skb_skip_tc_classify(skb))
4792                goto skip_classify;
4793
4794        if (pfmemalloc)
4795                goto skip_taps;
4796
4797        list_for_each_entry_rcu(ptype, &ptype_all, list) {
4798                if (pt_prev)
4799                        ret = deliver_skb(skb, pt_prev, orig_dev);
4800                pt_prev = ptype;
4801        }
4802
4803        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4804                if (pt_prev)
4805                        ret = deliver_skb(skb, pt_prev, orig_dev);
4806                pt_prev = ptype;
4807        }
4808
4809skip_taps:
4810#ifdef CONFIG_NET_INGRESS
4811        if (static_branch_unlikely(&ingress_needed_key)) {
4812                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4813                if (!skb)
4814                        goto out;
4815
4816                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4817                        goto out;
4818        }
4819#endif
4820        skb_reset_tc(skb);
4821skip_classify:
4822        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4823                goto drop;
4824
4825        if (skb_vlan_tag_present(skb)) {
4826                if (pt_prev) {
4827                        ret = deliver_skb(skb, pt_prev, orig_dev);
4828                        pt_prev = NULL;
4829                }
4830                if (vlan_do_receive(&skb))
4831                        goto another_round;
4832                else if (unlikely(!skb))
4833                        goto out;
4834        }
4835
4836        rx_handler = rcu_dereference(skb->dev->rx_handler);
4837        if (rx_handler) {
4838                if (pt_prev) {
4839                        ret = deliver_skb(skb, pt_prev, orig_dev);
4840                        pt_prev = NULL;
4841                }
4842                switch (rx_handler(&skb)) {
4843                case RX_HANDLER_CONSUMED:
4844                        ret = NET_RX_SUCCESS;
4845                        goto out;
4846                case RX_HANDLER_ANOTHER:
4847                        goto another_round;
4848                case RX_HANDLER_EXACT:
4849                        deliver_exact = true;
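                        /* fall through */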
4850                case RX_HANDLER_PASS:
4851                        break;
4852                default:
4853                        BUG();
4854                }
4855        }
4856
4857        if (unlikely(skb_vlan_tag_present(skb))) {
4858                if (skb_vlan_tag_get_id(skb))
4859                        skb->pkt_type = PACKET_OTHERHOST;
4860                /* Note: we might in the future use prio bits
4861                 * and set skb->priority like in vlan_do_receive().
4862                 * For the time being, just ignore the Priority Code Point.
4863                 */
4864                skb->vlan_tci = 0;
4865        }
4866
4867        type = skb->protocol;
4868
4869        /* deliver only exact match when indicated */
4870        if (likely(!deliver_exact)) {
4871                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4872                                       &ptype_base[ntohs(type) &
4873                                                   PTYPE_HASH_MASK]);
4874        }
4875
4876        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4877                               &orig_dev->ptype_specific);
4878
4879        if (unlikely(skb->dev != orig_dev)) {
4880                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4881                                       &skb->dev->ptype_specific);
4882        }
4883
4884        if (pt_prev) {
4885                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4886                        goto drop;
4887                *ppt_prev = pt_prev;
4888        } else {
4889drop:
4890                if (!deliver_exact)
4891                        atomic_long_inc(&skb->dev->rx_dropped);
4892                else
4893                        atomic_long_inc(&skb->dev->rx_nohandler);
4894                kfree_skb(skb);
4895                /* Jamal, now you will not be able to escape explaining
4896                 * to me how you were going to use this. :-)
4897                 */
4898                ret = NET_RX_DROP;
4899        }
4900
4901out:
4902        return ret;
4903}
4904
4905static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
4906{
4907        struct net_device *orig_dev = skb->dev;
4908        struct packet_type *pt_prev = NULL;
4909        int ret;
4910
4911        ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
4912        if (pt_prev)
4913                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4914        return ret;
4915}
4916
4917/**
4918 *      netif_receive_skb_core - special purpose version of netif_receive_skb
4919 *      @skb: buffer to process
4920 *
4921 *      More direct receive version of netif_receive_skb().  It should
4922 *      only be used by callers that have a need to skip RPS and Generic XDP.
4923 *      The caller must also take care of handling (page_is_)pfmemalloc skbs.
4924 *
4925 *      This function may only be called from softirq context and interrupts
4926 *      should be enabled.
4927 *
4928 *      Return values (usually ignored):
4929 *      NET_RX_SUCCESS: no congestion
4930 *      NET_RX_DROP: packet was dropped
4931 */
4932int netif_receive_skb_core(struct sk_buff *skb)
4933{
4934        int ret;
4935
4936        rcu_read_lock();
4937        ret = __netif_receive_skb_one_core(skb, false);
4938        rcu_read_unlock();
4939
4940        return ret;
4941}
4942EXPORT_SYMBOL(netif_receive_skb_core);
4943
4944static inline void __netif_receive_skb_list_ptype(struct list_head *head,
4945                                                  struct packet_type *pt_prev,
4946                                                  struct net_device *orig_dev)
4947{
4948        struct sk_buff *skb, *next;
4949
4950        if (!pt_prev)
4951                return;
4952        if (list_empty(head))
4953                return;
4954        if (pt_prev->list_func != NULL)
4955                pt_prev->list_func(head, pt_prev, orig_dev);
4956        else
4957                list_for_each_entry_safe(skb, next, head, list)
4958                        pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4959}
4960
4961static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
4962{
4963        /* Fast-path assumptions:
4964         * - There is no RX handler.
4965         * - Only one packet_type matches.
4966         * If either of these fails, we will end up doing some per-packet
4967         * processing in-line, then handling the 'last ptype' for the whole
4968         * sublist.  This can't cause out-of-order delivery to any single ptype,
4969         * because the 'last ptype' must be constant across the sublist, and all
4970         * other ptypes are handled per-packet.
4971         */
4972        /* Current (common) ptype of sublist */
4973        struct packet_type *pt_curr = NULL;
4974        /* Current (common) orig_dev of sublist */
4975        struct net_device *od_curr = NULL;
4976        struct list_head sublist;
4977        struct sk_buff *skb, *next;
4978
4979        INIT_LIST_HEAD(&sublist);
4980        list_for_each_entry_safe(skb, next, head, list) {
4981                struct net_device *orig_dev = skb->dev;
4982                struct packet_type *pt_prev = NULL;
4983
4984                list_del(&skb->list);
4985                __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
4986                if (!pt_prev)
4987                        continue;
4988                if (pt_curr != pt_prev || od_curr != orig_dev) {
4989                        /* dispatch old sublist */
4990                        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
4991                        /* start new sublist */
4992                        INIT_LIST_HEAD(&sublist);
4993                        pt_curr = pt_prev;
4994                        od_curr = orig_dev;
4995                }
4996                list_add_tail(&skb->list, &sublist);
4997        }
4998
4999        /* dispatch final sublist */
5000        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5001}
5002
5003static int __netif_receive_skb(struct sk_buff *skb)
5004{
5005        int ret;
5006
5007        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5008                unsigned int noreclaim_flag;
5009
5010                /*
5011                 * PFMEMALLOC skbs are special, they should
5012                 * - be delivered to SOCK_MEMALLOC sockets only
5013                 * - stay away from userspace
5014                 * - have bounded memory usage
5015                 *
5016                 * Use PF_MEMALLOC as this saves us from propagating the allocation
5017                 * context down to all allocation sites.
5018                 */
5019                noreclaim_flag = memalloc_noreclaim_save();
5020                ret = __netif_receive_skb_one_core(skb, true);
5021                memalloc_noreclaim_restore(noreclaim_flag);
5022        } else
5023                ret = __netif_receive_skb_one_core(skb, false);
5024
5025        return ret;
5026}
5027
5028static void __netif_receive_skb_list(struct list_head *head)
5029{
5030        unsigned long noreclaim_flag = 0;
5031        struct sk_buff *skb, *next;
5032        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5033
5034        list_for_each_entry_safe(skb, next, head, list) {
5035                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5036                        struct list_head sublist;
5037
5038                        /* Handle the previous sublist */
5039                        list_cut_before(&sublist, head, &skb->list);
5040                        if (!list_empty(&sublist))
5041                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
5042                        pfmemalloc = !pfmemalloc;
5043                        /* See comments in __netif_receive_skb */
5044                        if (pfmemalloc)
5045                                noreclaim_flag = memalloc_noreclaim_save();
5046                        else
5047                                memalloc_noreclaim_restore(noreclaim_flag);
5048                }
5049        }
5050        /* Handle the remaining sublist */
5051        if (!list_empty(head))
5052                __netif_receive_skb_list_core(head, pfmemalloc);
5053        /* Restore pflags */
5054        if (pfmemalloc)
5055                memalloc_noreclaim_restore(noreclaim_flag);
5056}
5057
5058static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5059{
5060        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5061        struct bpf_prog *new = xdp->prog;
5062        int ret = 0;
5063
5064        switch (xdp->command) {
5065        case XDP_SETUP_PROG:
5066                rcu_assign_pointer(dev->xdp_prog, new);
5067                if (old)
5068                        bpf_prog_put(old);
5069
5070                if (old && !new) {
5071                        static_branch_dec(&generic_xdp_needed_key);
5072                } else if (new && !old) {
5073                        static_branch_inc(&generic_xdp_needed_key);
5074                        dev_disable_lro(dev);
5075                        dev_disable_gro_hw(dev);
5076                }
5077                break;
5078
5079        case XDP_QUERY_PROG:
5080                xdp->prog_id = old ? old->aux->id : 0;
5081                break;
5082
5083        default:
5084                ret = -EINVAL;
5085                break;
5086        }
5087
5088        return ret;
5089}
5090
5091static int netif_receive_skb_internal(struct sk_buff *skb)
5092{
5093        int ret;
5094
5095        net_timestamp_check(netdev_tstamp_prequeue, skb);
5096
5097        if (skb_defer_rx_timestamp(skb))
5098                return NET_RX_SUCCESS;
5099
5100        if (static_branch_unlikely(&generic_xdp_needed_key)) {
5101                int ret;
5102
5103                preempt_disable();
5104                rcu_read_lock();
5105                ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5106                rcu_read_unlock();
5107                preempt_enable();
5108
5109                if (ret != XDP_PASS)
5110                        return NET_RX_DROP;
5111        }
5112
5113        rcu_read_lock();
5114#ifdef CONFIG_RPS
5115        if (static_key_false(&rps_needed)) {
5116                struct rps_dev_flow voidflow, *rflow = &voidflow;
5117                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5118
5119                if (cpu >= 0) {
5120                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5121                        rcu_read_unlock();
5122                        return ret;
5123                }
5124        }
5125#endif
5126        ret = __netif_receive_skb(skb);
5127        rcu_read_unlock();
5128        return ret;
5129}
5130
5131static void netif_receive_skb_list_internal(struct list_head *head)
5132{
5133        struct bpf_prog *xdp_prog = NULL;
5134        struct sk_buff *skb, *next;
5135        struct list_head sublist;
5136
5137        INIT_LIST_HEAD(&sublist);
5138        list_for_each_entry_safe(skb, next, head, list) {
5139                net_timestamp_check(netdev_tstamp_prequeue, skb);
5140                list_del(&skb->list);
5141                if (!skb_defer_rx_timestamp(skb))
5142                        list_add_tail(&skb->list, &sublist);
5143        }
5144        list_splice_init(&sublist, head);
5145
5146        if (static_branch_unlikely(&generic_xdp_needed_key)) {
5147                preempt_disable();
5148                rcu_read_lock();
5149                list_for_each_entry_safe(skb, next, head, list) {
5150                        xdp_prog = rcu_dereference(skb->dev->xdp_prog);
5151                        list_del(&skb->list);
5152                        if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
5153                                list_add_tail(&skb->list, &sublist);
5154                }
5155                rcu_read_unlock();
5156                preempt_enable();
5157                /* Put passed packets back on main list */
5158                list_splice_init(&sublist, head);
5159        }
5160
5161        rcu_read_lock();
5162#ifdef CONFIG_RPS
5163        if (static_key_false(&rps_needed)) {
5164                list_for_each_entry_safe(skb, next, head, list) {
5165                        struct rps_dev_flow voidflow, *rflow = &voidflow;
5166                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5167
5168                        if (cpu >= 0) {
5169                                /* Will be handled, remove from list */
5170                                list_del(&skb->list);
5171                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5172                        }
5173                }
5174        }
5175#endif
5176        __netif_receive_skb_list(head);
5177        rcu_read_unlock();
5178}
5179
5180/**
5181 *      netif_receive_skb - process receive buffer from network
5182 *      @skb: buffer to process
5183 *
5184 *      netif_receive_skb() is the main receive data processing function.
5185 *      It always succeeds. The buffer may be dropped during processing
5186 *      for congestion control or by the protocol layers.
5187 *
5188 *      This function may only be called from softirq context and interrupts
5189 *      should be enabled.
5190 *
5191 *      Return values (usually ignored):
5192 *      NET_RX_SUCCESS: no congestion
5193 *      NET_RX_DROP: packet was dropped
5194 */
5195int netif_receive_skb(struct sk_buff *skb)
5196{
5197        trace_netif_receive_skb_entry(skb);
5198
5199        return netif_receive_skb_internal(skb);
5200}
5201EXPORT_SYMBOL(netif_receive_skb);
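
/* Illustrative sketch (not part of the original file): a helper that a
 * hypothetical driver's NAPI poll routine might use to hand one received
 * frame to the stack.  Per the kerneldoc above it must run in softirq
 * context; example_rx_one() and the copy-based buffer handling are
 * assumptions made only for this sketch.
 */
static void example_rx_one(struct net_device *dev, const void *buf,
                           unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (unlikely(!skb)) {
                dev->stats.rx_dropped++;
                return;
        }

        skb_put_data(skb, buf, len);            /* copy the received frame */
        skb->protocol = eth_type_trans(skb, dev);

        netif_receive_skb(skb);                 /* hand it to the stack */
}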
5202
5203/**
5204 *      netif_receive_skb_list - process many receive buffers from network
5205 *      @head: list of skbs to process.
5206 *
5207 *      Since the return value of netif_receive_skb() is normally ignored, and
5208 *      wouldn't be meaningful for a list, this function returns void.
5209 *
5210 *      This function may only be called from softirq context and interrupts
5211 *      should be enabled.
5212 */
5213void netif_receive_skb_list(struct list_head *head)
5214{
5215        struct sk_buff *skb;
5216
5217        if (list_empty(head))
5218                return;
5219        list_for_each_entry(skb, head, list)
5220                trace_netif_receive_skb_list_entry(skb);
5221        netif_receive_skb_list_internal(head);
5222}
5223EXPORT_SYMBOL(netif_receive_skb_list);
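
/* Illustrative sketch (not part of the original file): batching already
 * built skbs onto a local list and handing them to the stack in one call.
 * example_deliver_batch() and its calling convention are assumptions made
 * only for this sketch.
 */
static void example_deliver_batch(struct net_device *dev,
                                  struct sk_buff **skbs, int n)
{
        struct list_head rx_list;
        int i;

        INIT_LIST_HEAD(&rx_list);
        for (i = 0; i < n; i++) {
                skbs[i]->protocol = eth_type_trans(skbs[i], dev);
                list_add_tail(&skbs[i]->list, &rx_list);
        }

        if (!list_empty(&rx_list))
                netif_receive_skb_list(&rx_list);
}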
5224
5225DEFINE_PER_CPU(struct work_struct, flush_works);
5226
5227/* Network device is going away, flush any packets still pending */
5228static void flush_backlog(struct work_struct *work)
5229{
5230        struct sk_buff *skb, *tmp;
5231        struct softnet_data *sd;
5232
5233        local_bh_disable();
5234        sd = this_cpu_ptr(&softnet_data);
5235
5236        local_irq_disable();
5237        rps_lock(sd);
5238        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5239                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5240                        __skb_unlink(skb, &sd->input_pkt_queue);
5241                        kfree_skb(skb);
5242                        input_queue_head_incr(sd);
5243                }
5244        }
5245        rps_unlock(sd);
5246        local_irq_enable();
5247
5248        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5249                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5250                        __skb_unlink(skb, &sd->process_queue);
5251                        kfree_skb(skb);
5252                        input_queue_head_incr(sd);
5253                }
5254        }
5255        local_bh_enable();
5256}
5257
5258static void flush_all_backlogs(void)
5259{
5260        unsigned int cpu;
5261
5262        get_online_cpus();
5263
5264        for_each_online_cpu(cpu)
5265                queue_work_on(cpu, system_highpri_wq,
5266                              per_cpu_ptr(&flush_works, cpu));
5267
5268        for_each_online_cpu(cpu)
5269                flush_work(per_cpu_ptr(&flush_works, cpu));
5270
5271        put_online_cpus();
5272}
5273
5274static int napi_gro_complete(struct sk_buff *skb)
5275{
5276        struct packet_offload *ptype;
5277        __be16 type = skb->protocol;
5278        struct list_head *head = &offload_base;
5279        int err = -ENOENT;
5280
5281        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5282
5283        if (NAPI_GRO_CB(skb)->count == 1) {
5284                skb_shinfo(skb)->gso_size = 0;
5285                goto out;
5286        }
5287
5288        rcu_read_lock();
5289        list_for_each_entry_rcu(ptype, head, list) {
5290                if (ptype->type != type || !ptype->callbacks.gro_complete)
5291                        continue;
5292
5293                err = ptype->callbacks.gro_complete(skb, 0);
5294                break;
5295        }
5296        rcu_read_unlock();
5297
5298        if (err) {
5299                WARN_ON(&ptype->list == head);
5300                kfree_skb(skb);
5301                return NET_RX_SUCCESS;
5302        }
5303
5304out:
5305        return netif_receive_skb_internal(skb);
5306}
5307
5308static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5309                                   bool flush_old)
5310{
5311        struct list_head *head = &napi->gro_hash[index].list;
5312        struct sk_buff *skb, *p;
5313
5314        list_for_each_entry_safe_reverse(skb, p, head, list) {
5315                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5316                        return;
5317                list_del(&skb->list);
5318                skb->next = NULL;
5319                napi_gro_complete(skb);
5320                napi->gro_hash[index].count--;
5321        }
5322
5323        if (!napi->gro_hash[index].count)
5324                __clear_bit(index, &napi->gro_bitmask);
5325}
5326
5327/* napi->gro_hash[].list contains packets ordered by age, with the
5328 * youngest packets at the head of the list.
5329 * Complete skbs in reverse order to reduce latencies.
5330 */
5331void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5332{
5333        u32 i;
5334
5335        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
5336                if (test_bit(i, &napi->gro_bitmask))
5337                        __napi_gro_flush_chain(napi, i, flush_old);
5338        }
5339}
5340EXPORT_SYMBOL(napi_gro_flush);
5341
5342static struct list_head *gro_list_prepare(struct napi_struct *napi,
5343                                          struct sk_buff *skb)
5344{
5345        unsigned int maclen = skb->dev->hard_header_len;
5346        u32 hash = skb_get_hash_raw(skb);
5347        struct list_head *head;
5348        struct sk_buff *p;
5349
5350        head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5351        list_for_each_entry(p, head, list) {
5352                unsigned long diffs;
5353
5354                NAPI_GRO_CB(p)->flush = 0;
5355
5356                if (hash != skb_get_hash_raw(p)) {
5357                        NAPI_GRO_CB(p)->same_flow = 0;
5358                        continue;
5359                }
5360
5361                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5362                diffs |= p->vlan_tci ^ skb->vlan_tci;
5363                diffs |= skb_metadata_dst_cmp(p, skb);
5364                diffs |= skb_metadata_differs(p, skb);
5365                if (maclen == ETH_HLEN)
5366                        diffs |= compare_ether_header(skb_mac_header(p),
5367                                                      skb_mac_header(skb));
5368                else if (!diffs)
5369                        diffs = memcmp(skb_mac_header(p),
5370                                       skb_mac_header(skb),
5371                                       maclen);
5372                NAPI_GRO_CB(p)->same_flow = !diffs;
5373        }
5374
5375        return head;
5376}
5377
5378static void skb_gro_reset_offset(struct sk_buff *skb)
5379{
5380        const struct skb_shared_info *pinfo = skb_shinfo(skb);
5381        const skb_frag_t *frag0 = &pinfo->frags[0];
5382
5383        NAPI_GRO_CB(skb)->data_offset = 0;
5384        NAPI_GRO_CB(skb)->frag0 = NULL;
5385        NAPI_GRO_CB(skb)->frag0_len = 0;
5386
5387        if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
5388            pinfo->nr_frags &&
5389            !PageHighMem(skb_frag_page(frag0))) {
5390                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5391                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5392                                                    skb_frag_size(frag0),
5393                                                    skb->end - skb->tail);
5394        }
5395}
5396
5397static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5398{
5399        struct skb_shared_info *pinfo = skb_shinfo(skb);
5400
5401        BUG_ON(skb->end - skb->tail < grow);
5402
5403        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5404
5405        skb->data_len -= grow;
5406        skb->tail += grow;
5407
5408        pinfo->frags[0].page_offset += grow;
5409        skb_frag_size_sub(&pinfo->frags[0], grow);
5410
5411        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5412                skb_frag_unref(skb, 0);
5413                memmove(pinfo->frags, pinfo->frags + 1,
5414                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5415        }
5416}
5417
5418static void gro_flush_oldest(struct list_head *head)
5419{
5420        struct sk_buff *oldest;
5421
5422        oldest = list_last_entry(head, struct sk_buff, list);
5423
5424        /* We are called with head length >= MAX_GRO_SKBS, so the list
5425         * cannot be empty and oldest cannot be NULL.
5426         */
5427        if (WARN_ON_ONCE(!oldest))
5428                return;
5429
5430        /* Do not adjust napi->gro_hash[].count, caller is adding a new
5431         * SKB to the chain.
5432         */
5433        list_del(&oldest->list);
5434        napi_gro_complete(oldest);
5435}
5436
5437static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5438{
5439        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5440        struct list_head *head = &offload_base;
5441        struct packet_offload *ptype;
5442        __be16 type = skb->protocol;
5443        struct list_head *gro_head;
5444        struct sk_buff *pp = NULL;
5445        enum gro_result ret;
5446        int same_flow;
5447        int grow;
5448
5449        if (netif_elide_gro(skb->dev))
5450                goto normal;
5451
5452        gro_head = gro_list_prepare(napi, skb);
5453
5454        rcu_read_lock();
5455        list_for_each_entry_rcu(ptype, head, list) {
5456                if (ptype->type != type || !ptype->callbacks.gro_receive)
5457                        continue;
5458
5459                skb_set_network_header(skb, skb_gro_offset(skb));
5460                skb_reset_mac_len(skb);
5461                NAPI_GRO_CB(skb)->same_flow = 0;
5462                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5463                NAPI_GRO_CB(skb)->free = 0;
5464                NAPI_GRO_CB(skb)->encap_mark = 0;
5465                NAPI_GRO_CB(skb)->recursion_counter = 0;
5466                NAPI_GRO_CB(skb)->is_fou = 0;
5467                NAPI_GRO_CB(skb)->is_atomic = 1;
5468                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5469
5470                /* Setup for GRO checksum validation */
5471                switch (skb->ip_summed) {
5472                case CHECKSUM_COMPLETE:
5473                        NAPI_GRO_CB(skb)->csum = skb->csum;
5474                        NAPI_GRO_CB(skb)->csum_valid = 1;
5475                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5476                        break;
5477                case CHECKSUM_UNNECESSARY:
5478                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5479                        NAPI_GRO_CB(skb)->csum_valid = 0;
5480                        break;
5481                default:
5482                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5483                        NAPI_GRO_CB(skb)->csum_valid = 0;
5484                }
5485
5486                pp = ptype->callbacks.gro_receive(gro_head, skb);
5487                break;
5488        }
5489        rcu_read_unlock();
5490
5491        if (&ptype->list == head)
5492                goto normal;
5493
5494        if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
5495                ret = GRO_CONSUMED;
5496                goto ok;
5497        }
5498
5499        same_flow = NAPI_GRO_CB(skb)->same_flow;
5500        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5501
5502        if (pp) {
5503                list_del(&pp->list);
5504                pp->next = NULL;
5505                napi_gro_complete(pp);
5506                napi->gro_hash[hash].count--;
5507        }
5508
5509        if (same_flow)
5510                goto ok;
5511
5512        if (NAPI_GRO_CB(skb)->flush)
5513                goto normal;
5514
5515        if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5516                gro_flush_oldest(gro_head);
5517        } else {
5518                napi->gro_hash[hash].count++;
5519        }
5520        NAPI_GRO_CB(skb)->count = 1;
5521        NAPI_GRO_CB(skb)->age = jiffies;
5522        NAPI_GRO_CB(skb)->last = skb;
5523        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5524        list_add(&skb->list, gro_head);
5525        ret = GRO_HELD;
5526
5527pull:
5528        grow = skb_gro_offset(skb) - skb_headlen(skb);
5529        if (grow > 0)
5530                gro_pull_from_frag0(skb, grow);
5531ok:
5532        if (napi->gro_hash[hash].count) {
5533                if (!test_bit(hash, &napi->gro_bitmask))
5534                        __set_bit(hash, &napi->gro_bitmask);
5535        } else if (test_bit(hash, &napi->gro_bitmask)) {
5536                __clear_bit(hash, &napi->gro_bitmask);
5537        }
5538
5539        return ret;
5540
5541normal:
5542        ret = GRO_NORMAL;
5543        goto pull;
5544}
5545
5546struct packet_offload *gro_find_receive_by_type(__be16 type)
5547{
5548        struct list_head *offload_head = &offload_base;
5549        struct packet_offload *ptype;
5550
5551        list_for_each_entry_rcu(ptype, offload_head, list) {
5552                if (ptype->type != type || !ptype->callbacks.gro_receive)
5553                        continue;
5554                return ptype;
5555        }
5556        return NULL;
5557}
5558EXPORT_SYMBOL(gro_find_receive_by_type);
5559
5560struct packet_offload *gro_find_complete_by_type(__be16 type)
5561{
5562        struct list_head *offload_head = &offload_base;
5563        struct packet_offload *ptype;
5564
5565        list_for_each_entry_rcu(ptype, offload_head, list) {
5566                if (ptype->type != type || !ptype->callbacks.gro_complete)
5567                        continue;
5568                return ptype;
5569        }
5570        return NULL;
5571}
5572EXPORT_SYMBOL(gro_find_complete_by_type);
5573
5574static void napi_skb_free_stolen_head(struct sk_buff *skb)
5575{
5576        skb_dst_drop(skb);
5577        secpath_reset(skb);
5578        kmem_cache_free(skbuff_head_cache, skb);
5579}
5580
5581static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5582{
5583        switch (ret) {
5584        case GRO_NORMAL:
5585                if (netif_receive_skb_internal(skb))
5586                        ret = GRO_DROP;
5587                break;
5588
5589        case GRO_DROP:
5590                kfree_skb(skb);
5591                break;
5592
5593        case GRO_MERGED_FREE:
5594                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5595                        napi_skb_free_stolen_head(skb);
5596                else
5597                        __kfree_skb(skb);
5598                break;
5599
5600        case GRO_HELD:
5601        case GRO_MERGED:
5602        case GRO_CONSUMED:
5603                break;
5604        }
5605
5606        return ret;
5607}
5608
5609gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5610{
5611        skb_mark_napi_id(skb, napi);
5612        trace_napi_gro_receive_entry(skb);
5613
5614        skb_gro_reset_offset(skb);
5615
5616        return napi_skb_finish(dev_gro_receive(napi, skb), skb);
5617}
5618EXPORT_SYMBOL(napi_gro_receive);
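
/* Illustrative sketch (not part of the original file): per-packet hand-off
 * from a hypothetical NAPI poll routine, preferring the GRO entry point
 * over netif_receive_skb().  example_gro_one() is an assumption made only
 * for this sketch.
 */
static void example_gro_one(struct napi_struct *napi, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, napi->dev);
        skb_record_rx_queue(skb, 0);

        /* May merge the skb into a held GRO chain, hold it, or pass it up */
        napi_gro_receive(napi, skb);
}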
5619
5620static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5621{
5622        if (unlikely(skb->pfmemalloc)) {
5623                consume_skb(skb);
5624                return;
5625        }
5626        __skb_pull(skb, skb_headlen(skb));
5627        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5628        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5629        skb->vlan_tci = 0;
5630        skb->dev = napi->dev;
5631        skb->skb_iif = 0;
5632        skb->encapsulation = 0;
5633        skb_shinfo(skb)->gso_type = 0;
5634        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5635        secpath_reset(skb);
5636
5637        napi->skb = skb;
5638}
5639
5640struct sk_buff *napi_get_frags(struct napi_struct *napi)
5641{
5642        struct sk_buff *skb = napi->skb;
5643
5644        if (!skb) {
5645                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5646                if (skb) {
5647                        napi->skb = skb;
5648                        skb_mark_napi_id(skb, napi);
5649                }
5650        }
5651        return skb;
5652}
5653EXPORT_SYMBOL(napi_get_frags);
5654
5655static gro_result_t napi_frags_finish(struct napi_struct *napi,
5656                                      struct sk_buff *skb,
5657                                      gro_result_t ret)
5658{
5659        switch (ret) {
5660        case GRO_NORMAL:
5661        case GRO_HELD:
5662                __skb_push(skb, ETH_HLEN);
5663                skb->protocol = eth_type_trans(skb, skb->dev);
5664                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5665                        ret = GRO_DROP;
5666                break;
5667
5668        case GRO_DROP:
5669                napi_reuse_skb(napi, skb);
5670                break;
5671
5672        case GRO_MERGED_FREE:
5673                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5674                        napi_skb_free_stolen_head(skb);
5675                else
5676                        napi_reuse_skb(napi, skb);
5677                break;
5678
5679        case GRO_MERGED:
5680        case GRO_CONSUMED:
5681                break;
5682        }
5683
5684        return ret;
5685}
5686
5687/* The upper GRO stack assumes the network header starts at gro_offset=0.
5688 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
5689 * we copy the ethernet header into skb->data to have a common layout.
5690 */
5691static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5692{
5693        struct sk_buff *skb = napi->skb;
5694        const struct ethhdr *eth;
5695        unsigned int hlen = sizeof(*eth);
5696
5697        napi->skb = NULL;
5698
5699        skb_reset_mac_header(skb);
5700        skb_gro_reset_offset(skb);
5701
5702        eth = skb_gro_header_fast(skb, 0);
5703        if (unlikely(skb_gro_header_hard(skb, hlen))) {
5704                eth = skb_gro_header_slow(skb, hlen, 0);
5705                if (unlikely(!eth)) {
5706                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5707                                             __func__, napi->dev->name);
5708                        napi_reuse_skb(napi, skb);
5709                        return NULL;
5710                }
5711        } else {
5712                gro_pull_from_frag0(skb, hlen);
5713                NAPI_GRO_CB(skb)->frag0 += hlen;
5714                NAPI_GRO_CB(skb)->frag0_len -= hlen;
5715        }
5716        __skb_pull(skb, hlen);
5717
5718        /*
5719         * This works because the only protocols we care about don't require
5720         * special handling.
5721         * We'll fix it up properly in napi_frags_finish()
5722         */
5723        skb->protocol = eth->h_proto;
5724
5725        return skb;
5726}
5727
5728gro_result_t napi_gro_frags(struct napi_struct *napi)
5729{
5730        struct sk_buff *skb = napi_frags_skb(napi);
5731
5732        if (!skb)
5733                return GRO_DROP;
5734
5735        trace_napi_gro_frags_entry(skb);
5736
5737        return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5738}
5739EXPORT_SYMBOL(napi_gro_frags);
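
/* Illustrative sketch (not part of the original file): the page-fragment
 * GRO path.  A hypothetical driver attaches one received fragment to the
 * skb cached by napi_get_frags() and lets napi_gro_frags() pull the
 * ethernet header itself.  example_gro_frag() is an assumption made only
 * for this sketch.
 */
static int example_gro_frag(struct napi_struct *napi, struct page *page,
                            unsigned int offset, unsigned int len)
{
        struct sk_buff *skb;

        skb = napi_get_frags(napi);
        if (unlikely(!skb))
                return -ENOMEM;

        skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
                        PAGE_SIZE);

        napi_gro_frags(napi);           /* consumes or recycles napi->skb */
        return 0;
}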
5740
5741/* Compute the checksum from gro_offset and return the folded value
5742 * after adding in any pseudo checksum.
5743 */
5744__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5745{
5746        __wsum wsum;
5747        __sum16 sum;
5748
5749        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5750
5751        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5752        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5753        if (likely(!sum)) {
5754                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5755                    !skb->csum_complete_sw)
5756                        netdev_rx_csum_fault(skb->dev);
5757        }
5758
5759        NAPI_GRO_CB(skb)->csum = wsum;
5760        NAPI_GRO_CB(skb)->csum_valid = 1;
5761
5762        return sum;
5763}
5764EXPORT_SYMBOL(__skb_gro_checksum_complete);
5765
5766static void net_rps_send_ipi(struct softnet_data *remsd)
5767{
5768#ifdef CONFIG_RPS
5769        while (remsd) {
5770                struct softnet_data *next = remsd->rps_ipi_next;
5771
5772                if (cpu_online(remsd->cpu))
5773                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
5774                remsd = next;
5775        }
5776#endif
5777}
5778
5779/*
5780 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
5781 * Note: called with local irq disabled, but exits with local irq enabled.
5782 */
5783static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5784{
5785#ifdef CONFIG_RPS
5786        struct softnet_data *remsd = sd->rps_ipi_list;
5787
5788        if (remsd) {
5789                sd->rps_ipi_list = NULL;
5790
5791                local_irq_enable();
5792
5793                /* Send pending IPIs to kick RPS processing on remote cpus. */
5794                net_rps_send_ipi(remsd);
5795        } else
5796#endif
5797                local_irq_enable();
5798}
5799
5800static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5801{
5802#ifdef CONFIG_RPS
5803        return sd->rps_ipi_list != NULL;
5804#else
5805        return false;
5806#endif
5807}
5808
5809static int process_backlog(struct napi_struct *napi, int quota)
5810{
5811        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5812        bool again = true;
5813        int work = 0;
5814
5815        /* Check if we have pending IPIs; it's better to send them now
5816         * rather than waiting for net_rx_action() to end.
5817         */
5818        if (sd_has_rps_ipi_waiting(sd)) {
5819                local_irq_disable();
5820                net_rps_action_and_irq_enable(sd);
5821        }
5822
5823        napi->weight = dev_rx_weight;
5824        while (again) {
5825                struct sk_buff *skb;
5826
5827                while ((skb = __skb_dequeue(&sd->process_queue))) {
5828                        rcu_read_lock();
5829                        __netif_receive_skb(skb);
5830                        rcu_read_unlock();
5831                        input_queue_head_incr(sd);
5832                        if (++work >= quota)
5833                                return work;
5834
5835                }
5836
5837                local_irq_disable();
5838                rps_lock(sd);
5839                if (skb_queue_empty(&sd->input_pkt_queue)) {
5840                        /*
5841                         * Inline a custom version of __napi_complete().
5842                         * Only the current cpu owns and manipulates this napi,
5843                         * and NAPI_STATE_SCHED is the only possible flag set
5844                         * on backlog.
5845                         * We can use a plain write instead of clear_bit(),
5846                         * and we don't need an smp_mb() memory barrier.
5847                         */
5848                        napi->state = 0;
5849                        again = false;
5850                } else {
5851                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
5852                                                   &sd->process_queue);
5853                }
5854                rps_unlock(sd);
5855                local_irq_enable();
5856        }
5857
5858        return work;
5859}
5860
5861/**
5862 * __napi_schedule - schedule for receive
5863 * @n: entry to schedule
5864 *
5865 * The entry's receive function will be scheduled to run.
5866 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5867 */
5868void __napi_schedule(struct napi_struct *n)
5869{
5870        unsigned long flags;
5871
5872        local_irq_save(flags);
5873        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5874        local_irq_restore(flags);
5875}
5876EXPORT_SYMBOL(__napi_schedule);
5877
5878/**
5879 *      napi_schedule_prep - check if napi can be scheduled
5880 *      @n: napi context
5881 *
5882 * Test if NAPI routine is already running, and if not mark
5883 * it as running.  This is used as a condition variable to
5884 * ensure only one NAPI poll instance runs.  We also make
5885 * sure there is no pending NAPI disable.
5886 */
5887bool napi_schedule_prep(struct napi_struct *n)
5888{
5889        unsigned long val, new;
5890
5891        do {
5892                val = READ_ONCE(n->state);
5893                if (unlikely(val & NAPIF_STATE_DISABLE))
5894                        return false;
5895                new = val | NAPIF_STATE_SCHED;
5896
5897                /* Sets STATE_MISSED bit if STATE_SCHED was already set
5898                 * This was suggested by Alexander Duyck, as the compiler
5899                 * emits better code than:
5900                 * if (val & NAPIF_STATE_SCHED)
5901                 *     new |= NAPIF_STATE_MISSED;
5902                 */
5903                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5904                                                   NAPIF_STATE_MISSED;
5905        } while (cmpxchg(&n->state, val, new) != val);
5906
5907        return !(val & NAPIF_STATE_SCHED);
5908}
5909EXPORT_SYMBOL(napi_schedule_prep);
5910
5911/**
5912 * __napi_schedule_irqoff - schedule for receive
5913 * @n: entry to schedule
5914 *
5915 * Variant of __napi_schedule() assuming hard irqs are masked
5916 */
5917void __napi_schedule_irqoff(struct napi_struct *n)
5918{
5919        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5920}
5921EXPORT_SYMBOL(__napi_schedule_irqoff);
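
/* Illustrative sketch (not part of the original file): the usual hard-irq
 * pattern for kicking NAPI.  example_isr() is an assumption made only for
 * this sketch; it assumes the device has already masked its own interrupts.
 */
static irqreturn_t example_isr(int irq, void *data)
{
        struct napi_struct *napi = data;

        /* Schedule the poll routine only if it is not already running */
        if (napi_schedule_prep(napi))
                __napi_schedule_irqoff(napi);

        return IRQ_HANDLED;
}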
5922
5923bool napi_complete_done(struct napi_struct *n, int work_done)
5924{
5925        unsigned long flags, val, new;
5926
5927        /*
5928         * 1) Don't let napi dequeue from the cpu poll list
5929         *    just in case it's running on a different cpu.
5930         * 2) If we are busy polling, do nothing here, we have
5931         *    the guarantee we will be called later.
5932         */
5933        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5934                                 NAPIF_STATE_IN_BUSY_POLL)))
5935                return false;
5936
5937        if (n->gro_bitmask) {
5938                unsigned long timeout = 0;
5939
5940                if (work_done)
5941                        timeout = n->dev->gro_flush_timeout;
5942
5943                if (timeout)
5944                        hrtimer_start(&n->timer, ns_to_ktime(timeout),
5945                                      HRTIMER_MODE_REL_PINNED);
5946                else
5947                        napi_gro_flush(n, false);
5948        }
5949        if (unlikely(!list_empty(&n->poll_list))) {
5950                /* If n->poll_list is not empty, we need to mask irqs */
5951                local_irq_save(flags);
5952                list_del_init(&n->poll_list);
5953                local_irq_restore(flags);
5954        }
5955
5956        do {
5957                val = READ_ONCE(n->state);
5958
5959                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5960
5961                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5962
5963                /* If STATE_MISSED was set, leave STATE_SCHED set,
5964                 * because we will call napi->poll() one more time.
5965                 * This C code was suggested by Alexander Duyck to help gcc.
5966                 */
5967                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5968                                                    NAPIF_STATE_SCHED;
5969        } while (cmpxchg(&n->state, val, new) != val);
5970
5971        if (unlikely(val & NAPIF_STATE_MISSED)) {
5972                __napi_schedule(n);
5973                return false;
5974        }
5975
5976        return true;
5977}
5978EXPORT_SYMBOL(napi_complete_done);
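
/* Illustrative sketch (not part of the original file): how a poll routine
 * typically finishes.  example_poll(), example_clean_rx() and
 * example_enable_irqs() are assumptions made only for this sketch.
 */
static int example_clean_rx(struct napi_struct *napi, int budget);     /* hypothetical */
static void example_enable_irqs(struct net_device *dev);               /* hypothetical */

static int example_poll(struct napi_struct *napi, int budget)
{
        int work_done = example_clean_rx(napi, budget);

        /* Only complete when less than the full budget was used; the return
         * value of napi_complete_done() tells us whether device interrupts
         * may be re-armed (false means NAPI was rescheduled instead).
         */
        if (work_done < budget && napi_complete_done(napi, work_done))
                example_enable_irqs(napi->dev);

        return work_done;
}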
5979
5980/* must be called under rcu_read_lock(), as we dont take a reference */
5981static struct napi_struct *napi_by_id(unsigned int napi_id)
5982{
5983        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5984        struct napi_struct *napi;
5985
5986        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5987                if (napi->napi_id == napi_id)
5988                        return napi;
5989
5990        return NULL;
5991}
5992
5993#if defined(CONFIG_NET_RX_BUSY_POLL)
5994
5995#define BUSY_POLL_BUDGET 8
5996
5997static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5998{
5999        int rc;
6000
6001        /* Busy polling means there is a high chance device driver hard irq
6002         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6003         * set in napi_schedule_prep().
6004         * Since we are about to call napi->poll() once more, we can safely
6005         * clear NAPI_STATE_MISSED.
6006         *
6007         * Note: x86 could use a single "lock and ..." instruction
6008         * to perform these two clear_bit()
6009         */
6010        clear_bit(NAPI_STATE_MISSED, &napi->state);
6011        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6012
6013        local_bh_disable();
6014
6015        /* All we really want here is to re-enable device interrupts.
6016         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6017         */
6018        rc = napi->poll(napi, BUSY_POLL_BUDGET);
6019        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6020        netpoll_poll_unlock(have_poll_lock);
6021        if (rc == BUSY_POLL_BUDGET)
6022                __napi_schedule(napi);
6023        local_bh_enable();
6024}
6025
6026void napi_busy_loop(unsigned int napi_id,
6027                    bool (*loop_end)(void *, unsigned long),
6028                    void *loop_end_arg)
6029{
6030        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6031        int (*napi_poll)(struct napi_struct *napi, int budget);
6032        void *have_poll_lock = NULL;
6033        struct napi_struct *napi;
6034
6035restart:
6036        napi_poll = NULL;
6037
6038        rcu_read_lock();
6039
6040        napi = napi_by_id(napi_id);
6041        if (!napi)
6042                goto out;
6043
6044        preempt_disable();
6045        for (;;) {
6046                int work = 0;
6047
6048                local_bh_disable();
6049                if (!napi_poll) {
6050                        unsigned long val = READ_ONCE(napi->state);
6051
6052                        /* If multiple threads are competing for this napi,
6053                         * we avoid dirtying napi->state as much as we can.
6054                         */
6055                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6056                                   NAPIF_STATE_IN_BUSY_POLL))
6057                                goto count;
6058                        if (cmpxchg(&napi->state, val,
6059                                    val | NAPIF_STATE_IN_BUSY_POLL |
6060                                          NAPIF_STATE_SCHED) != val)
6061                                goto count;
6062                        have_poll_lock = netpoll_poll_lock(napi);
6063                        napi_poll = napi->poll;
6064                }
6065                work = napi_poll(napi, BUSY_POLL_BUDGET);
6066                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6067count:
6068                if (work > 0)
6069                        __NET_ADD_STATS(dev_net(napi->dev),
6070                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
6071                local_bh_enable();
6072
6073                if (!loop_end || loop_end(loop_end_arg, start_time))
6074                        break;
6075
6076                if (unlikely(need_resched())) {
6077                        if (napi_poll)
6078                                busy_poll_stop(napi, have_poll_lock);
6079                        preempt_enable();
6080                        rcu_read_unlock();
6081                        cond_resched();
6082                        if (loop_end(loop_end_arg, start_time))
6083                                return;
6084                        goto restart;
6085                }
6086                cpu_relax();
6087        }
6088        if (napi_poll)
6089                busy_poll_stop(napi, have_poll_lock);
6090        preempt_enable();
6091out:
6092        rcu_read_unlock();
6093}
6094EXPORT_SYMBOL(napi_busy_loop);
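
/* Illustrative sketch (not part of the original file): busy-polling a NAPI
 * id until a caller-supplied flag is set.  The flag-based loop_end callback
 * is an assumption made only for this sketch; real callers such as the
 * socket layer supply their own termination check.
 */
static bool example_loop_end(void *arg, unsigned long start_time)
{
        return READ_ONCE(*(bool *)arg);         /* stop once the flag is set */
}

static void example_busy_poll(unsigned int napi_id, bool *stop)
{
        napi_busy_loop(napi_id, example_loop_end, stop);
}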
6095
6096#endif /* CONFIG_NET_RX_BUSY_POLL */
6097
6098static void napi_hash_add(struct napi_struct *napi)
6099{
6100        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6101            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6102                return;
6103
6104        spin_lock(&napi_hash_lock);
6105
6106        /* 0..NR_CPUS range is reserved for sender_cpu use */
6107        do {
6108                if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6109                        napi_gen_id = MIN_NAPI_ID;
6110        } while (napi_by_id(napi_gen_id));
6111        napi->napi_id = napi_gen_id;
6112
6113        hlist_add_head_rcu(&napi->napi_hash_node,
6114                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6115
6116        spin_unlock(&napi_hash_lock);
6117}
6118
6119/* Warning: the caller is responsible for making sure an RCU grace period
6120 * is respected before freeing the memory containing @napi.
6121 */
6122bool napi_hash_del(struct napi_struct *napi)
6123{
6124        bool rcu_sync_needed = false;
6125
6126        spin_lock(&napi_hash_lock);
6127
6128        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6129                rcu_sync_needed = true;
6130                hlist_del_rcu(&napi->napi_hash_node);
6131        }
6132        spin_unlock(&napi_hash_lock);
6133        return rcu_sync_needed;
6134}
6135EXPORT_SYMBOL_GPL(napi_hash_del);
6136
6137static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6138{
6139        struct napi_struct *napi;
6140
6141        napi = container_of(timer, struct napi_struct, timer);
6142
6143        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6144         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6145         */
6146        if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6147            !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6148                __napi_schedule_irqoff(napi);
6149
6150        return HRTIMER_NORESTART;
6151}
6152
6153static void init_gro_hash(struct napi_struct *napi)
6154{
6155        int i;
6156
6157        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6158                INIT_LIST_HEAD(&napi->gro_hash[i].list);
6159                napi->gro_hash[i].count = 0;
6160        }
6161        napi->gro_bitmask = 0;
6162}
6163
6164void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6165                    int (*poll)(struct napi_struct *, int), int weight)
6166{
6167        INIT_LIST_HEAD(&napi->poll_list);
6168        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6169        napi->timer.function = napi_watchdog;
6170        init_gro_hash(napi);
6171        napi->skb = NULL;
6172        napi->poll = poll;
6173        if (weight > NAPI_POLL_WEIGHT)
6174                pr_err_once("netif_napi_add() called with weight %d on device %s\n",
6175                            weight, dev->name);
6176        napi->weight = weight;
6177        list_add(&napi->dev_list, &dev->napi_list);
6178        napi->dev = dev;
6179#ifdef CONFIG_NETPOLL
6180        napi->poll_owner = -1;
6181#endif
6182        set_bit(NAPI_STATE_SCHED, &napi->state);
6183        napi_hash_add(napi);
6184}
6185EXPORT_SYMBOL(netif_napi_add);
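
/* Illustrative sketch (not part of the original file): the usual NAPI
 * setup in a hypothetical driver.  struct example_priv, example_napi_poll()
 * and example_setup_napi() are assumptions made only for this sketch;
 * napi_enable() at open time clears the NAPI_STATE_SCHED bit set above.
 */
struct example_priv {
        struct napi_struct napi;
};

static int example_napi_poll(struct napi_struct *napi, int budget);    /* hypothetical */

static void example_setup_napi(struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);

        netif_napi_add(dev, &priv->napi, example_napi_poll, NAPI_POLL_WEIGHT);
}

static void example_open_napi(struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);

        napi_enable(&priv->napi);
}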
6186
6187void napi_disable(struct napi_struct *n)
6188{
6189        might_sleep();
6190        set_bit(NAPI_STATE_DISABLE, &n->state);
6191
6192        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6193                msleep(1);
6194        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6195                msleep(1);
6196
6197        hrtimer_cancel(&n->timer);
6198
6199        clear_bit(NAPI_STATE_DISABLE, &n->state);
6200}
6201EXPORT_SYMBOL(napi_disable);
6202
6203static void flush_gro_hash(struct napi_struct *napi)
6204{
6205        int i;
6206
6207        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6208                struct sk_buff *skb, *n;
6209
6210                list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6211                        kfree_skb(skb);
6212                napi->gro_hash[i].count = 0;
6213        }
6214}
6215
6216/* Must be called in process context */
6217void netif_napi_del(struct napi_struct *napi)
6218{
6219        might_sleep();
6220        if (napi_hash_del(napi))
6221                synchronize_net();
6222        list_del_init(&napi->dev_list);
6223        napi_free_frags(napi);
6224
6225        flush_gro_hash(napi);
6226        napi->gro_bitmask = 0;
6227}
6228EXPORT_SYMBOL(netif_napi_del);
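
/*
 * Illustrative sketch, not part of this file: tearing down the NAPI
 * instance from the sketch above.  napi_disable() sleeps until any
 * in-flight poll has finished; netif_napi_del() then unhashes the
 * instance and drops any packets still held for GRO (process context
 * only, per the comment above).
 */
static void my_teardown_napi(struct my_priv *priv)
{
        napi_disable(&priv->napi);
        netif_napi_del(&priv->napi);
}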
6229
6230static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6231{
6232        void *have;
6233        int work, weight;
6234
6235        list_del_init(&n->poll_list);
6236
6237        have = netpoll_poll_lock(n);
6238
6239        weight = n->weight;
6240
6241        /* This NAPI_STATE_SCHED test is for avoiding a race
6242         * with netpoll's poll_napi().  Only the entity which
6243         * obtains the lock and sees NAPI_STATE_SCHED set will
6244         * actually make the ->poll() call.  Therefore we avoid
6245         * accidentally calling ->poll() when NAPI is not scheduled.
6246         */
6247        work = 0;
6248        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6249                work = n->poll(n, weight);
6250                trace_napi_poll(n, work, weight);
6251        }
6252
6253        WARN_ON_ONCE(work > weight);
6254
6255        if (likely(work < weight))
6256                goto out_unlock;
6257
6258        /* Drivers must not modify the NAPI state if they
6259         * consume the entire weight.  In such cases this code
6260         * still "owns" the NAPI instance and therefore can
6261         * move the instance around on the list at-will.
6262         */
6263        if (unlikely(napi_disable_pending(n))) {
6264                napi_complete(n);
6265                goto out_unlock;
6266        }
6267
6268        if (n->gro_bitmask) {
6269                /* flush too old packets
6270                 * If HZ < 1000, flush all packets.
6271                 */
6272                napi_gro_flush(n, HZ >= 1000);
6273        }
6274
6275        /* Some drivers may have called napi_schedule
6276         * prior to exhausting their budget.
6277         */
6278        if (unlikely(!list_empty(&n->poll_list))) {
6279                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6280                             n->dev ? n->dev->name : "backlog");
6281                goto out_unlock;
6282        }
6283
6284        list_add_tail(&n->poll_list, repoll);
6285
6286out_unlock:
6287        netpoll_poll_unlock(have);
6288
6289        return work;
6290}
6291
6292static __latent_entropy void net_rx_action(struct softirq_action *h)
6293{
6294        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6295        unsigned long time_limit = jiffies +
6296                usecs_to_jiffies(netdev_budget_usecs);
6297        int budget = netdev_budget;
6298        LIST_HEAD(list);
6299        LIST_HEAD(repoll);
6300
6301        local_irq_disable();
6302        list_splice_init(&sd->poll_list, &list);
6303        local_irq_enable();
6304
6305        for (;;) {
6306                struct napi_struct *n;
6307
6308                if (list_empty(&list)) {
6309                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6310                                goto out;
6311                        break;
6312                }
6313
6314                n = list_first_entry(&list, struct napi_struct, poll_list);
6315                budget -= napi_poll(n, &repoll);
6316
6317                /* If the softirq window is exhausted then punt.
6318                 * Allow this to run for 2 jiffies, which allows
6319                 * an average latency of 1.5/HZ.
6320                 */
6321                if (unlikely(budget <= 0 ||
6322                             time_after_eq(jiffies, time_limit))) {
6323                        sd->time_squeeze++;
6324                        break;
6325                }
6326        }
6327
6328        local_irq_disable();
6329
6330        list_splice_tail_init(&sd->poll_list, &list);
6331        list_splice_tail(&repoll, &list);
6332        list_splice(&list, &sd->poll_list);
6333        if (!list_empty(&sd->poll_list))
6334                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6335
6336        net_rps_action_and_irq_enable(sd);
6337out:
6338        __kfree_skb_flush();
6339}
6340
6341struct netdev_adjacent {
6342        struct net_device *dev;
6343
6344        /* upper master flag; there can only be one master device per list */
6345        bool master;
6346
6347        /* counter for the number of times this device was added to us */
6348        u16 ref_nr;
6349
6350        /* private field for the users */
6351        void *private;
6352
6353        struct list_head list;
6354        struct rcu_head rcu;
6355};
6356
6357static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6358                                                 struct list_head *adj_list)
6359{
6360        struct netdev_adjacent *adj;
6361
6362        list_for_each_entry(adj, adj_list, list) {
6363                if (adj->dev == adj_dev)
6364                        return adj;
6365        }
6366        return NULL;
6367}
6368
6369static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6370{
6371        struct net_device *dev = data;
6372
6373        return upper_dev == dev;
6374}
6375
6376/**
6377 * netdev_has_upper_dev - Check if device is linked to an upper device
6378 * @dev: device
6379 * @upper_dev: upper device to check
6380 *
6381 * Find out if a device is linked to the specified upper device and return
6382 * true in case it is. Note that this checks only the immediate upper device,
6383 * not through a complete stack of devices. The caller must hold the RTNL lock.
6384 */
6385bool netdev_has_upper_dev(struct net_device *dev,
6386                          struct net_device *upper_dev)
6387{
6388        ASSERT_RTNL();
6389
6390        return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6391                                             upper_dev);
6392}
6393EXPORT_SYMBOL(netdev_has_upper_dev);
6394
6395/**
6396 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6397 * @dev: device
6398 * @upper_dev: upper device to check
6399 *
6400 * Find out if a device is linked to the specified upper device and return
6401 * true in case it is. Note that this checks the entire upper device chain.
6402 * The caller must hold the RCU read lock.
6403 */
6404
6405bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6406                                  struct net_device *upper_dev)
6407{
6408        return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6409                                               upper_dev);
6410}
6411EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6412
6413/**
6414 * netdev_has_any_upper_dev - Check if device is linked to some device
6415 * @dev: device
6416 *
6417 * Find out if a device is linked to an upper device and return true in case
6418 * it is. The caller must hold the RTNL lock.
6419 */
6420bool netdev_has_any_upper_dev(struct net_device *dev)
6421{
6422        ASSERT_RTNL();
6423
6424        return !list_empty(&dev->adj_list.upper);
6425}
6426EXPORT_SYMBOL(netdev_has_any_upper_dev);
6427
6428/**
6429 * netdev_master_upper_dev_get - Get master upper device
6430 * @dev: device
6431 *
6432 * Find a master upper device and return pointer to it or NULL in case
6433 * it's not there. The caller must hold the RTNL lock.
6434 */
6435struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6436{
6437        struct netdev_adjacent *upper;
6438
6439        ASSERT_RTNL();
6440
6441        if (list_empty(&dev->adj_list.upper))
6442                return NULL;
6443
6444        upper = list_first_entry(&dev->adj_list.upper,
6445                                 struct netdev_adjacent, list);
6446        if (likely(upper->master))
6447                return upper->dev;
6448        return NULL;
6449}
6450EXPORT_SYMBOL(netdev_master_upper_dev_get);
6451
6452/**
6453 * netdev_has_any_lower_dev - Check if device is linked to some device
6454 * @dev: device
6455 *
6456 * Find out if a device is linked to a lower device and return true in case
6457 * it is. The caller must hold the RTNL lock.
6458 */
6459static bool netdev_has_any_lower_dev(struct net_device *dev)
6460{
6461        ASSERT_RTNL();
6462
6463        return !list_empty(&dev->adj_list.lower);
6464}
6465
6466void *netdev_adjacent_get_private(struct list_head *adj_list)
6467{
6468        struct netdev_adjacent *adj;
6469
6470        adj = list_entry(adj_list, struct netdev_adjacent, list);
6471
6472        return adj->private;
6473}
6474EXPORT_SYMBOL(netdev_adjacent_get_private);
6475
6476/**
6477 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6478 * @dev: device
6479 * @iter: list_head ** of the current position
6480 *
6481 * Gets the next device from the dev's upper list, starting from iter
6482 * position. The caller must hold RCU read lock.
6483 */
6484struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6485                                                 struct list_head **iter)
6486{
6487        struct netdev_adjacent *upper;
6488
6489        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6490
6491        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6492
6493        if (&upper->list == &dev->adj_list.upper)
6494                return NULL;
6495
6496        *iter = &upper->list;
6497
6498        return upper->dev;
6499}
6500EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6501
6502static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6503                                                    struct list_head **iter)
6504{
6505        struct netdev_adjacent *upper;
6506
6507        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6508
6509        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6510
6511        if (&upper->list == &dev->adj_list.upper)
6512                return NULL;
6513
6514        *iter = &upper->list;
6515
6516        return upper->dev;
6517}
6518
6519int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6520                                  int (*fn)(struct net_device *dev,
6521                                            void *data),
6522                                  void *data)
6523{
6524        struct net_device *udev;
6525        struct list_head *iter;
6526        int ret;
6527
6528        for (iter = &dev->adj_list.upper,
6529             udev = netdev_next_upper_dev_rcu(dev, &iter);
6530             udev;
6531             udev = netdev_next_upper_dev_rcu(dev, &iter)) {
6532                /* first is the upper device itself */
6533                ret = fn(udev, data);
6534                if (ret)
6535                        return ret;
6536
6537                /* then look at all of its upper devices */
6538                ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
6539                if (ret)
6540                        return ret;
6541        }
6542
6543        return 0;
6544}
6545EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
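
/*
 * Illustrative sketch, not part of this file: counting every device
 * stacked above @dev with the walker above.  A non-zero return from the
 * callback stops the walk early; the lower-dev walkers below follow the
 * same contract.  my_count_one()/my_count_uppers() are hypothetical.
 */
static int my_count_one(struct net_device *upper, void *data)
{
        unsigned int *count = data;

        (*count)++;
        return 0;       /* keep walking */
}

static unsigned int my_count_uppers(struct net_device *dev)
{
        unsigned int count = 0;

        rcu_read_lock();
        netdev_walk_all_upper_dev_rcu(dev, my_count_one, &count);
        rcu_read_unlock();

        return count;
}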
6546
6547/**
6548 * netdev_lower_get_next_private - Get the next ->private from the
6549 *                                 lower neighbour list
6550 * @dev: device
6551 * @iter: list_head ** of the current position
6552 *
6553 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6554 * list, starting from iter position. The caller must either hold the
6555 * RTNL lock or its own locking that guarantees that the neighbour lower
6556 * list will remain unchanged.
6557 */
6558void *netdev_lower_get_next_private(struct net_device *dev,
6559                                    struct list_head **iter)
6560{
6561        struct netdev_adjacent *lower;
6562
6563        lower = list_entry(*iter, struct netdev_adjacent, list);
6564
6565        if (&lower->list == &dev->adj_list.lower)
6566                return NULL;
6567
6568        *iter = lower->list.next;
6569
6570        return lower->private;
6571}
6572EXPORT_SYMBOL(netdev_lower_get_next_private);
6573
6574/**
6575 * netdev_lower_get_next_private_rcu - Get the next ->private from the
6576 *                                     lower neighbour list, RCU
6577 *                                     variant
6578 * @dev: device
6579 * @iter: list_head ** of the current position
6580 *
6581 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6582 * list, starting from iter position. The caller must hold RCU read lock.
6583 */
6584void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6585                                        struct list_head **iter)
6586{
6587        struct netdev_adjacent *lower;
6588
6589        WARN_ON_ONCE(!rcu_read_lock_held());
6590
6591        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6592
6593        if (&lower->list == &dev->adj_list.lower)
6594                return NULL;
6595
6596        *iter = &lower->list;
6597
6598        return lower->private;
6599}
6600EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6601
6602/**
6603 * netdev_lower_get_next - Get the next device from the lower neighbour
6604 *                         list
6605 * @dev: device
6606 * @iter: list_head ** of the current position
6607 *
6608 * Gets the next netdev_adjacent from the dev's lower neighbour
6609 * list, starting from iter position. The caller must hold the RTNL lock or
6610 * its own locking that guarantees that the neighbour lower
6611 * list will remain unchanged.
6612 */
6613void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6614{
6615        struct netdev_adjacent *lower;
6616
6617        lower = list_entry(*iter, struct netdev_adjacent, list);
6618
6619        if (&lower->list == &dev->adj_list.lower)
6620                return NULL;
6621
6622        *iter = lower->list.next;
6623
6624        return lower->dev;
6625}
6626EXPORT_SYMBOL(netdev_lower_get_next);
6627
6628static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6629                                                struct list_head **iter)
6630{
6631        struct netdev_adjacent *lower;
6632
6633        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6634
6635        if (&lower->list == &dev->adj_list.lower)
6636                return NULL;
6637
6638        *iter = &lower->list;
6639
6640        return lower->dev;
6641}
6642
6643int netdev_walk_all_lower_dev(struct net_device *dev,
6644                              int (*fn)(struct net_device *dev,
6645                                        void *data),
6646                              void *data)
6647{
6648        struct net_device *ldev;
6649        struct list_head *iter;
6650        int ret;
6651
6652        for (iter = &dev->adj_list.lower,
6653             ldev = netdev_next_lower_dev(dev, &iter);
6654             ldev;
6655             ldev = netdev_next_lower_dev(dev, &iter)) {
6656                /* first is the lower device itself */
6657                ret = fn(ldev, data);
6658                if (ret)
6659                        return ret;
6660
6661                /* then look at all of its lower devices */
6662                ret = netdev_walk_all_lower_dev(ldev, fn, data);
6663                if (ret)
6664                        return ret;
6665        }
6666
6667        return 0;
6668}
6669EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6670
6671static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6672                                                    struct list_head **iter)
6673{
6674        struct netdev_adjacent *lower;
6675
6676        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6677        if (&lower->list == &dev->adj_list.lower)
6678                return NULL;
6679
6680        *iter = &lower->list;
6681
6682        return lower->dev;
6683}
6684
6685int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
6686                                  int (*fn)(struct net_device *dev,
6687                                            void *data),
6688                                  void *data)
6689{
6690        struct net_device *ldev;
6691        struct list_head *iter;
6692        int ret;
6693
6694        for (iter = &dev->adj_list.lower,
6695             ldev = netdev_next_lower_dev_rcu(dev, &iter);
6696             ldev;
6697             ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
6698                /* first is the lower device itself */
6699                ret = fn(ldev, data);
6700                if (ret)
6701                        return ret;
6702
6703                /* then look at all of its lower devices */
6704                ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
6705                if (ret)
6706                        return ret;
6707        }
6708
6709        return 0;
6710}
6711EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
6712
6713/**
6714 * netdev_lower_get_first_private_rcu - Get the first ->private from the
6715 *                                     lower neighbour list, RCU
6716 *                                     variant
6717 * @dev: device
6718 *
6719 * Gets the first netdev_adjacent->private from the dev's lower neighbour
6720 * list. The caller must hold RCU read lock.
6721 */
6722void *netdev_lower_get_first_private_rcu(struct net_device *dev)
6723{
6724        struct netdev_adjacent *lower;
6725
6726        lower = list_first_or_null_rcu(&dev->adj_list.lower,
6727                        struct netdev_adjacent, list);
6728        if (lower)
6729                return lower->private;
6730        return NULL;
6731}
6732EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
6733
6734/**
6735 * netdev_master_upper_dev_get_rcu - Get master upper device
6736 * @dev: device
6737 *
6738 * Find a master upper device and return pointer to it or NULL in case
6739 * it's not there. The caller must hold the RCU read lock.
6740 */
6741struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
6742{
6743        struct netdev_adjacent *upper;
6744
6745        upper = list_first_or_null_rcu(&dev->adj_list.upper,
6746                                       struct netdev_adjacent, list);
6747        if (upper && likely(upper->master))
6748                return upper->dev;
6749        return NULL;
6750}
6751EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
6752
6753static int netdev_adjacent_sysfs_add(struct net_device *dev,
6754                              struct net_device *adj_dev,
6755                              struct list_head *dev_list)
6756{
6757        char linkname[IFNAMSIZ+7];
6758
6759        sprintf(linkname, dev_list == &dev->adj_list.upper ?
6760                "upper_%s" : "lower_%s", adj_dev->name);
6761        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
6762                                 linkname);
6763}
6764static void netdev_adjacent_sysfs_del(struct net_device *dev,
6765                               char *name,
6766                               struct list_head *dev_list)
6767{
6768        char linkname[IFNAMSIZ+7];
6769
6770        sprintf(linkname, dev_list == &dev->adj_list.upper ?
6771                "upper_%s" : "lower_%s", name);
6772        sysfs_remove_link(&(dev->dev.kobj), linkname);
6773}
6774
6775static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
6776                                                 struct net_device *adj_dev,
6777                                                 struct list_head *dev_list)
6778{
6779        return (dev_list == &dev->adj_list.upper ||
6780                dev_list == &dev->adj_list.lower) &&
6781                net_eq(dev_net(dev), dev_net(adj_dev));
6782}
6783
6784static int __netdev_adjacent_dev_insert(struct net_device *dev,
6785                                        struct net_device *adj_dev,
6786                                        struct list_head *dev_list,
6787                                        void *private, bool master)
6788{
6789        struct netdev_adjacent *adj;
6790        int ret;
6791
6792        adj = __netdev_find_adj(adj_dev, dev_list);
6793
6794        if (adj) {
6795                adj->ref_nr += 1;
6796                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
6797                         dev->name, adj_dev->name, adj->ref_nr);
6798
6799                return 0;
6800        }
6801
6802        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
6803        if (!adj)
6804                return -ENOMEM;
6805
6806        adj->dev = adj_dev;
6807        adj->master = master;
6808        adj->ref_nr = 1;
6809        adj->private = private;
6810        dev_hold(adj_dev);
6811
6812        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
6813                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
6814
6815        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
6816                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
6817                if (ret)
6818                        goto free_adj;
6819        }
6820
6821        /* Ensure that master link is always the first item in list. */
6822        if (master) {
6823                ret = sysfs_create_link(&(dev->dev.kobj),
6824                                        &(adj_dev->dev.kobj), "master");
6825                if (ret)
6826                        goto remove_symlinks;
6827
6828                list_add_rcu(&adj->list, dev_list);
6829        } else {
6830                list_add_tail_rcu(&adj->list, dev_list);
6831        }
6832
6833        return 0;
6834
6835remove_symlinks:
6836        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6837                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6838free_adj:
6839        kfree(adj);
6840        dev_put(adj_dev);
6841
6842        return ret;
6843}
6844
6845static void __netdev_adjacent_dev_remove(struct net_device *dev,
6846                                         struct net_device *adj_dev,
6847                                         u16 ref_nr,
6848                                         struct list_head *dev_list)
6849{
6850        struct netdev_adjacent *adj;
6851
6852        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
6853                 dev->name, adj_dev->name, ref_nr);
6854
6855        adj = __netdev_find_adj(adj_dev, dev_list);
6856
6857        if (!adj) {
6858                pr_err("Adjacency does not exist for device %s from %s\n",
6859                       dev->name, adj_dev->name);
6860                WARN_ON(1);
6861                return;
6862        }
6863
6864        if (adj->ref_nr > ref_nr) {
6865                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
6866                         dev->name, adj_dev->name, ref_nr,
6867                         adj->ref_nr - ref_nr);
6868                adj->ref_nr -= ref_nr;
6869                return;
6870        }
6871
6872        if (adj->master)
6873                sysfs_remove_link(&(dev->dev.kobj), "master");
6874
6875        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6876                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6877
6878        list_del_rcu(&adj->list);
6879        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
6880                 adj_dev->name, dev->name, adj_dev->name);
6881        dev_put(adj_dev);
6882        kfree_rcu(adj, rcu);
6883}
6884
6885static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
6886                                            struct net_device *upper_dev,
6887                                            struct list_head *up_list,
6888                                            struct list_head *down_list,
6889                                            void *private, bool master)
6890{
6891        int ret;
6892
6893        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
6894                                           private, master);
6895        if (ret)
6896                return ret;
6897
6898        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
6899                                           private, false);
6900        if (ret) {
6901                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
6902                return ret;
6903        }
6904
6905        return 0;
6906}
6907
6908static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
6909                                               struct net_device *upper_dev,
6910                                               u16 ref_nr,
6911                                               struct list_head *up_list,
6912                                               struct list_head *down_list)
6913{
6914        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
6915        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
6916}
6917
6918static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
6919                                                struct net_device *upper_dev,
6920                                                void *private, bool master)
6921{
6922        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
6923                                                &dev->adj_list.upper,
6924                                                &upper_dev->adj_list.lower,
6925                                                private, master);
6926}
6927
6928static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
6929                                                   struct net_device *upper_dev)
6930{
6931        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
6932                                           &dev->adj_list.upper,
6933                                           &upper_dev->adj_list.lower);
6934}
6935
6936static int __netdev_upper_dev_link(struct net_device *dev,
6937                                   struct net_device *upper_dev, bool master,
6938                                   void *upper_priv, void *upper_info,
6939                                   struct netlink_ext_ack *extack)
6940{
6941        struct netdev_notifier_changeupper_info changeupper_info = {
6942                .info = {
6943                        .dev = dev,
6944                        .extack = extack,
6945                },
6946                .upper_dev = upper_dev,
6947                .master = master,
6948                .linking = true,
6949                .upper_info = upper_info,
6950        };
6951        struct net_device *master_dev;
6952        int ret = 0;
6953
6954        ASSERT_RTNL();
6955
6956        if (dev == upper_dev)
6957                return -EBUSY;
6958
6959        /* To prevent loops, check that dev is not an upper device of upper_dev. */
6960        if (netdev_has_upper_dev(upper_dev, dev))
6961                return -EBUSY;
6962
6963        if (!master) {
6964                if (netdev_has_upper_dev(dev, upper_dev))
6965                        return -EEXIST;
6966        } else {
6967                master_dev = netdev_master_upper_dev_get(dev);
6968                if (master_dev)
6969                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
6970        }
6971
6972        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
6973                                            &changeupper_info.info);
6974        ret = notifier_to_errno(ret);
6975        if (ret)
6976                return ret;
6977
6978        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6979                                                   master);
6980        if (ret)
6981                return ret;
6982
6983        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6984                                            &changeupper_info.info);
6985        ret = notifier_to_errno(ret);
6986        if (ret)
6987                goto rollback;
6988
6989        return 0;
6990
6991rollback:
6992        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6993
6994        return ret;
6995}
6996
6997/**
6998 * netdev_upper_dev_link - Add a link to the upper device
6999 * @dev: device
7000 * @upper_dev: new upper device
7001 * @extack: netlink extended ack
7002 *
7003 * Adds a link to a device which is upper to this one. The caller must hold
7004 * the RTNL lock. On a failure a negative errno code is returned.
7005 * On success the reference counts are adjusted and the function
7006 * returns zero.
7007 */
7008int netdev_upper_dev_link(struct net_device *dev,
7009                          struct net_device *upper_dev,
7010                          struct netlink_ext_ack *extack)
7011{
7012        return __netdev_upper_dev_link(dev, upper_dev, false,
7013                                       NULL, NULL, extack);
7014}
7015EXPORT_SYMBOL(netdev_upper_dev_link);
7016
7017/**
7018 * netdev_master_upper_dev_link - Add a master link to the upper device
7019 * @dev: device
7020 * @upper_dev: new upper device
7021 * @upper_priv: upper device private
7022 * @upper_info: upper info to be passed down via notifier
7023 * @extack: netlink extended ack
7024 *
7025 * Adds a link to a device which is upper to this one. In this case, only
7026 * one master upper device can be linked, although other non-master devices
7027 * might be linked as well. The caller must hold the RTNL lock.
7028 * On a failure a negative errno code is returned. On success the reference
7029 * counts are adjusted and the function returns zero.
7030 */
7031int netdev_master_upper_dev_link(struct net_device *dev,
7032                                 struct net_device *upper_dev,
7033                                 void *upper_priv, void *upper_info,
7034                                 struct netlink_ext_ack *extack)
7035{
7036        return __netdev_upper_dev_link(dev, upper_dev, true,
7037                                       upper_priv, upper_info, extack);
7038}
7039EXPORT_SYMBOL(netdev_master_upper_dev_link);
7040
7041/**
7042 * netdev_upper_dev_unlink - Removes a link to upper device
7043 * @dev: device
7044 * @upper_dev: upper device to unlink
7045 *
7046 * Removes a link to a device which is upper to this one. The caller must hold
7047 * the RTNL lock.
7048 */
7049void netdev_upper_dev_unlink(struct net_device *dev,
7050                             struct net_device *upper_dev)
7051{
7052        struct netdev_notifier_changeupper_info changeupper_info = {
7053                .info = {
7054                        .dev = dev,
7055                },
7056                .upper_dev = upper_dev,
7057                .linking = false,
7058        };
7059
7060        ASSERT_RTNL();
7061
7062        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7063
7064        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7065                                      &changeupper_info.info);
7066
7067        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7068
7069        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7070                                      &changeupper_info.info);
7071}
7072EXPORT_SYMBOL(netdev_upper_dev_unlink);
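
/*
 * Illustrative sketch, not part of this file: how a stacking driver links
 * its upper device over a lower one and undoes it on error.  "lower" and
 * "upper" are hypothetical devices; the master variant above is used in
 * the same way by bonding/team style drivers.
 */
static int my_stack_devices(struct net_device *lower,
                            struct net_device *upper,
                            struct netlink_ext_ack *extack)
{
        int err;

        ASSERT_RTNL();

        err = netdev_upper_dev_link(lower, upper, extack);
        if (err)
                return err;

        /* ... remaining setup; on a later failure roll back with: */
        /* netdev_upper_dev_unlink(lower, upper); */

        return 0;
}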
7073
7074/**
7075 * netdev_bonding_info_change - Dispatch event about slave change
7076 * @dev: device
7077 * @bonding_info: info to dispatch
7078 *
7079 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7080 * The caller must hold the RTNL lock.
7081 */
7082void netdev_bonding_info_change(struct net_device *dev,
7083                                struct netdev_bonding_info *bonding_info)
7084{
7085        struct netdev_notifier_bonding_info info = {
7086                .info.dev = dev,
7087        };
7088
7089        memcpy(&info.bonding_info, bonding_info,
7090               sizeof(struct netdev_bonding_info));
7091        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7092                                      &info.info);
7093}
7094EXPORT_SYMBOL(netdev_bonding_info_change);
7095
7096static void netdev_adjacent_add_links(struct net_device *dev)
7097{
7098        struct netdev_adjacent *iter;
7099
7100        struct net *net = dev_net(dev);
7101
7102        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7103                if (!net_eq(net, dev_net(iter->dev)))
7104                        continue;
7105                netdev_adjacent_sysfs_add(iter->dev, dev,
7106                                          &iter->dev->adj_list.lower);
7107                netdev_adjacent_sysfs_add(dev, iter->dev,
7108                                          &dev->adj_list.upper);
7109        }
7110
7111        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7112                if (!net_eq(net, dev_net(iter->dev)))
7113                        continue;
7114                netdev_adjacent_sysfs_add(iter->dev, dev,
7115                                          &iter->dev->adj_list.upper);
7116                netdev_adjacent_sysfs_add(dev, iter->dev,
7117                                          &dev->adj_list.lower);
7118        }
7119}
7120
7121static void netdev_adjacent_del_links(struct net_device *dev)
7122{
7123        struct netdev_adjacent *iter;
7124
7125        struct net *net = dev_net(dev);
7126
7127        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7128                if (!net_eq(net, dev_net(iter->dev)))
7129                        continue;
7130                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7131                                          &iter->dev->adj_list.lower);
7132                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7133                                          &dev->adj_list.upper);
7134        }
7135
7136        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7137                if (!net_eq(net, dev_net(iter->dev)))
7138                        continue;
7139                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7140                                          &iter->dev->adj_list.upper);
7141                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7142                                          &dev->adj_list.lower);
7143        }
7144}
7145
7146void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7147{
7148        struct netdev_adjacent *iter;
7149
7150        struct net *net = dev_net(dev);
7151
7152        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7153                if (!net_eq(net, dev_net(iter->dev)))
7154                        continue;
7155                netdev_adjacent_sysfs_del(iter->dev, oldname,
7156                                          &iter->dev->adj_list.lower);
7157                netdev_adjacent_sysfs_add(iter->dev, dev,
7158                                          &iter->dev->adj_list.lower);
7159        }
7160
7161        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7162                if (!net_eq(net, dev_net(iter->dev)))
7163                        continue;
7164                netdev_adjacent_sysfs_del(iter->dev, oldname,
7165                                          &iter->dev->adj_list.upper);
7166                netdev_adjacent_sysfs_add(iter->dev, dev,
7167                                          &iter->dev->adj_list.upper);
7168        }
7169}
7170
7171void *netdev_lower_dev_get_private(struct net_device *dev,
7172                                   struct net_device *lower_dev)
7173{
7174        struct netdev_adjacent *lower;
7175
7176        if (!lower_dev)
7177                return NULL;
7178        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
7179        if (!lower)
7180                return NULL;
7181
7182        return lower->private;
7183}
7184EXPORT_SYMBOL(netdev_lower_dev_get_private);
7185
7186
7187int dev_get_nest_level(struct net_device *dev)
7188{
7189        struct net_device *lower = NULL;
7190        struct list_head *iter;
7191        int max_nest = -1;
7192        int nest;
7193
7194        ASSERT_RTNL();
7195
7196        netdev_for_each_lower_dev(dev, lower, iter) {
7197                nest = dev_get_nest_level(lower);
7198                if (max_nest < nest)
7199                        max_nest = nest;
7200        }
7201
7202        return max_nest + 1;
7203}
7204EXPORT_SYMBOL(dev_get_nest_level);
7205
7206/**
7207 * netdev_lower_state_changed - Dispatch event about lower device state change
7208 * @lower_dev: device
7209 * @lower_state_info: state to dispatch
7210 *
7211 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
7212 * The caller must hold the RTNL lock.
7213 */
7214void netdev_lower_state_changed(struct net_device *lower_dev,
7215                                void *lower_state_info)
7216{
7217        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
7218                .info.dev = lower_dev,
7219        };
7220
7221        ASSERT_RTNL();
7222        changelowerstate_info.lower_state_info = lower_state_info;
7223        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
7224                                      &changelowerstate_info.info);
7225}
7226EXPORT_SYMBOL(netdev_lower_state_changed);
7227
7228static void dev_change_rx_flags(struct net_device *dev, int flags)
7229{
7230        const struct net_device_ops *ops = dev->netdev_ops;
7231
7232        if (ops->ndo_change_rx_flags)
7233                ops->ndo_change_rx_flags(dev, flags);
7234}
7235
7236static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
7237{
7238        unsigned int old_flags = dev->flags;
7239        kuid_t uid;
7240        kgid_t gid;
7241
7242        ASSERT_RTNL();
7243
7244        dev->flags |= IFF_PROMISC;
7245        dev->promiscuity += inc;
7246        if (dev->promiscuity == 0) {
7247                /*
7248                 * Avoid overflow.
7249                 * If inc causes an overflow, leave promiscuity untouched and return an error.
7250                 */
7251                if (inc < 0)
7252                        dev->flags &= ~IFF_PROMISC;
7253                else {
7254                        dev->promiscuity -= inc;
7255                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
7256                                dev->name);
7257                        return -EOVERFLOW;
7258                }
7259        }
7260        if (dev->flags != old_flags) {
7261                pr_info("device %s %s promiscuous mode\n",
7262                        dev->name,
7263                        dev->flags & IFF_PROMISC ? "entered" : "left");
7264                if (audit_enabled) {
7265                        current_uid_gid(&uid, &gid);
7266                        audit_log(audit_context(), GFP_ATOMIC,
7267                                  AUDIT_ANOM_PROMISCUOUS,
7268                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
7269                                  dev->name, (dev->flags & IFF_PROMISC),
7270                                  (old_flags & IFF_PROMISC),
7271                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
7272                                  from_kuid(&init_user_ns, uid),
7273                                  from_kgid(&init_user_ns, gid),
7274                                  audit_get_sessionid(current));
7275                }
7276
7277                dev_change_rx_flags(dev, IFF_PROMISC);
7278        }
7279        if (notify)
7280                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
7281        return 0;
7282}
7283
7284/**
7285 *      dev_set_promiscuity     - update promiscuity count on a device
7286 *      @dev: device
7287 *      @inc: modifier
7288 *
7289 *      Add or remove promiscuity from a device. While the count in the device
7290 *      remains above zero the interface remains promiscuous. Once it hits zero
7291 *      the device reverts to normal filtering operation. A negative @inc
7292 *      value is used to drop promiscuity on the device.
7293 *      Return 0 if successful or a negative errno code on error.
7294 */
7295int dev_set_promiscuity(struct net_device *dev, int inc)
7296{
7297        unsigned int old_flags = dev->flags;
7298        int err;
7299
7300        err = __dev_set_promiscuity(dev, inc, true);
7301        if (err < 0)
7302                return err;
7303        if (dev->flags != old_flags)
7304                dev_set_rx_mode(dev);
7305        return err;
7306}
7307EXPORT_SYMBOL(dev_set_promiscuity);
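
/*
 * Illustrative sketch, not part of this file: because dev_set_promiscuity()
 * maintains a counter, every +1 taken when a feature is enabled must be
 * balanced by a -1 when it is disabled (both under RTNL).  dev_set_allmulti()
 * below follows the same counting pattern.  The my_feature_* helpers are
 * hypothetical.
 */
static int my_feature_enable(struct net_device *dev)
{
        return dev_set_promiscuity(dev, 1);     /* may fail, e.g. -EOVERFLOW */
}

static void my_feature_disable(struct net_device *dev)
{
        dev_set_promiscuity(dev, -1);           /* drop the count we took */
}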
7308
7309static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
7310{
7311        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
7312
7313        ASSERT_RTNL();
7314
7315        dev->flags |= IFF_ALLMULTI;
7316        dev->allmulti += inc;
7317        if (dev->allmulti == 0) {
7318                /*
7319                 * Avoid overflow.
7320                 * If inc causes an overflow, leave allmulti untouched and return an error.
7321                 */
7322                if (inc < 0)
7323                        dev->flags &= ~IFF_ALLMULTI;
7324                else {
7325                        dev->allmulti -= inc;
7326                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
7327                                dev->name);
7328                        return -EOVERFLOW;
7329                }
7330        }
7331        if (dev->flags ^ old_flags) {
7332                dev_change_rx_flags(dev, IFF_ALLMULTI);
7333                dev_set_rx_mode(dev);
7334                if (notify)
7335                        __dev_notify_flags(dev, old_flags,
7336                                           dev->gflags ^ old_gflags);
7337        }
7338        return 0;
7339}
7340
7341/**
7342 *      dev_set_allmulti        - update allmulti count on a device
7343 *      @dev: device
7344 *      @inc: modifier
7345 *
7346 *      Add or remove reception of all multicast frames to a device. While the
7347 *      count in the device remains above zero the interface remains listening
7348 *      to all multicast frames. Once it hits zero the device reverts to normal
7349 *      filtering operation. A negative @inc value is used to drop the counter
7350 *      when releasing a resource needing all multicasts.
7351 *      Return 0 if successful or a negative errno code on error.
7352 */
7353
7354int dev_set_allmulti(struct net_device *dev, int inc)
7355{
7356        return __dev_set_allmulti(dev, inc, true);
7357}
7358EXPORT_SYMBOL(dev_set_allmulti);
7359
7360/*
7361 *      Upload unicast and multicast address lists to device and
7362 *      configure RX filtering. When the device doesn't support unicast
7363 *      filtering it is put in promiscuous mode while unicast addresses
7364 *      are present.
7365 */
7366void __dev_set_rx_mode(struct net_device *dev)
7367{
7368        const struct net_device_ops *ops = dev->netdev_ops;
7369
7370        /* dev_open will call this function so the list will stay sane. */
7371        if (!(dev->flags&IFF_UP))
7372                return;
7373
7374        if (!netif_device_present(dev))
7375                return;
7376
7377        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
7378                /* Unicast address changes may only happen under the rtnl,
7379                 * therefore calling __dev_set_promiscuity here is safe.
7380                 */
7381                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
7382                        __dev_set_promiscuity(dev, 1, false);
7383                        dev->uc_promisc = true;
7384                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
7385                        __dev_set_promiscuity(dev, -1, false);
7386                        dev->uc_promisc = false;
7387                }
7388        }
7389
7390        if (ops->ndo_set_rx_mode)
7391                ops->ndo_set_rx_mode(dev);
7392}
7393
7394void dev_set_rx_mode(struct net_device *dev)
7395{
7396        netif_addr_lock_bh(dev);
7397        __dev_set_rx_mode(dev);
7398        netif_addr_unlock_bh(dev);
7399}
7400
7401/**
7402 *      dev_get_flags - get flags reported to userspace
7403 *      @dev: device
7404 *
7405 *      Get the combination of flag bits exported through APIs to userspace.
7406 */
7407unsigned int dev_get_flags(const struct net_device *dev)
7408{
7409        unsigned int flags;
7410
7411        flags = (dev->flags & ~(IFF_PROMISC |
7412                                IFF_ALLMULTI |
7413                                IFF_RUNNING |
7414                                IFF_LOWER_UP |
7415                                IFF_DORMANT)) |
7416                (dev->gflags & (IFF_PROMISC |
7417                                IFF_ALLMULTI));
7418
7419        if (netif_running(dev)) {
7420                if (netif_oper_up(dev))
7421                        flags |= IFF_RUNNING;
7422                if (netif_carrier_ok(dev))
7423                        flags |= IFF_LOWER_UP;
7424                if (netif_dormant(dev))
7425                        flags |= IFF_DORMANT;
7426        }
7427
7428        return flags;
7429}
7430EXPORT_SYMBOL(dev_get_flags);
7431
7432int __dev_change_flags(struct net_device *dev, unsigned int flags)
7433{
7434        unsigned int old_flags = dev->flags;
7435        int ret;
7436
7437        ASSERT_RTNL();
7438
7439        /*
7440         *      Set the flags on our device.
7441         */
7442
7443        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
7444                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
7445                               IFF_AUTOMEDIA)) |
7446                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
7447                                    IFF_ALLMULTI));
7448
7449        /*
7450         *      Load in the correct multicast list now that the flags have changed.
7451         */
7452
7453        if ((old_flags ^ flags) & IFF_MULTICAST)
7454                dev_change_rx_flags(dev, IFF_MULTICAST);
7455
7456        dev_set_rx_mode(dev);
7457
7458        /*
7459         *      Have we downed the interface? We handle IFF_UP ourselves
7460         *      according to user attempts to set it, rather than blindly
7461         *      setting it.
7462         */
7463
7464        ret = 0;
7465        if ((old_flags ^ flags) & IFF_UP) {
7466                if (old_flags & IFF_UP)
7467                        __dev_close(dev);
7468                else
7469                        ret = __dev_open(dev);
7470        }
7471
7472        if ((flags ^ dev->gflags) & IFF_PROMISC) {
7473                int inc = (flags & IFF_PROMISC) ? 1 : -1;
7474                unsigned int old_flags = dev->flags;
7475
7476                dev->gflags ^= IFF_PROMISC;
7477
7478                if (__dev_set_promiscuity(dev, inc, false) >= 0)
7479                        if (dev->flags != old_flags)
7480                                dev_set_rx_mode(dev);
7481        }
7482
7483        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
7484         * is important. Some (broken) drivers set IFF_PROMISC when
7485         * IFF_ALLMULTI is requested, without asking us and without reporting.
7486         */
7487        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
7488                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
7489
7490                dev->gflags ^= IFF_ALLMULTI;
7491                __dev_set_allmulti(dev, inc, false);
7492        }
7493
7494        return ret;
7495}
7496
7497void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
7498                        unsigned int gchanges)
7499{
7500        unsigned int changes = dev->flags ^ old_flags;
7501
7502        if (gchanges)
7503                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
7504
7505        if (changes & IFF_UP) {
7506                if (dev->flags & IFF_UP)
7507                        call_netdevice_notifiers(NETDEV_UP, dev);
7508                else
7509                        call_netdevice_notifiers(NETDEV_DOWN, dev);
7510        }
7511
7512        if (dev->flags & IFF_UP &&
7513            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7514                struct netdev_notifier_change_info change_info = {
7515                        .info = {
7516                                .dev = dev,
7517                        },
7518                        .flags_changed = changes,
7519                };
7520
7521                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
7522        }
7523}
7524
7525/**
7526 *      dev_change_flags - change device settings
7527 *      @dev: device
7528 *      @flags: device state flags
7529 *
7530 *      Change settings on a device based on the supplied state flags. The
7531 *      flags are given in the userspace exported format.
7532 */
7533int dev_change_flags(struct net_device *dev, unsigned int flags)
7534{
7535        int ret;
7536        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7537
7538        ret = __dev_change_flags(dev, flags);
7539        if (ret < 0)
7540                return ret;
7541
7542        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7543        __dev_notify_flags(dev, old_flags, changes);
7544        return ret;
7545}
7546EXPORT_SYMBOL(dev_change_flags);
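
/*
 * Illustrative sketch, not part of this file: toggling one userspace-visible
 * flag while preserving the others, mirroring what tools do around
 * SIOCGIFFLAGS/SIOCSIFFLAGS.  Must run under the RTNL lock; my_set_iface_up()
 * is a hypothetical helper.
 */
static int my_set_iface_up(struct net_device *dev, bool up)
{
        unsigned int flags = dev_get_flags(dev);

        ASSERT_RTNL();

        if (up)
                flags |= IFF_UP;
        else
                flags &= ~IFF_UP;

        return dev_change_flags(dev, flags);
}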
7547
7548int __dev_set_mtu(struct net_device *dev, int new_mtu)
7549{
7550        const struct net_device_ops *ops = dev->netdev_ops;
7551
7552        if (ops->ndo_change_mtu)
7553                return ops->ndo_change_mtu(dev, new_mtu);
7554
7555        dev->mtu = new_mtu;
7556        return 0;
7557}
7558EXPORT_SYMBOL(__dev_set_mtu);
7559
7560/**
7561 *      dev_set_mtu_ext - Change maximum transfer unit
7562 *      @dev: device
7563 *      @new_mtu: new transfer unit
7564 *      @extack: netlink extended ack
7565 *
7566 *      Change the maximum transfer size of the network device.
7567 */
7568int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
7569                    struct netlink_ext_ack *extack)
7570{
7571        int err, orig_mtu;
7572
7573        if (new_mtu == dev->mtu)
7574                return 0;
7575
7576        /* MTU must not be negative, and must respect the device's min/max range */
7577        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7578                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
7579                return -EINVAL;
7580        }
7581
7582        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7583                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
7584                return -EINVAL;
7585        }
7586
7587        if (!netif_device_present(dev))
7588                return -ENODEV;
7589
7590        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
7591        err = notifier_to_errno(err);
7592        if (err)
7593                return err;
7594
7595        orig_mtu = dev->mtu;
7596        err = __dev_set_mtu(dev, new_mtu);
7597
7598        if (!err) {
7599                err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
7600                                                   orig_mtu);
7601                err = notifier_to_errno(err);
7602                if (err) {
7603                        /* setting mtu back and notifying everyone again,
7604                         * so that they have a chance to revert changes.
7605                         */
7606                        __dev_set_mtu(dev, orig_mtu);
7607                        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
7608                                                     new_mtu);
7609                }
7610        }
7611        return err;
7612}
7613
7614int dev_set_mtu(struct net_device *dev, int new_mtu)
7615{
7616        struct netlink_ext_ack extack;
7617        int err;
7618
7619        memset(&extack, 0, sizeof(extack));
7620        err = dev_set_mtu_ext(dev, new_mtu, &extack);
7621        if (err && extack._msg)
7622                net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
7623        return err;
7624}
7625EXPORT_SYMBOL(dev_set_mtu);
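
/*
 * Illustrative sketch, not part of this file: changing the MTU from a
 * control path.  dev_set_mtu() checks dev->min_mtu/max_mtu, runs the
 * PRECHANGEMTU/CHANGEMTU notifiers and rolls back on error.  9000 is
 * just an example value; my_enable_jumbo_frames() is hypothetical.
 */
static int my_enable_jumbo_frames(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();

        return err;
}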
7626
7627/**
7628 *      dev_change_tx_queue_len - Change TX queue length of a netdevice
7629 *      @dev: device
7630 *      @new_len: new tx queue length
7631 */
7632int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7633{
7634        unsigned int orig_len = dev->tx_queue_len;
7635        int res;
7636
7637        if (new_len != (unsigned int)new_len)
7638                return -ERANGE;
7639
7640        if (new_len != orig_len) {
7641                dev->tx_queue_len = new_len;
7642                res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7643                res = notifier_to_errno(res);
7644                if (res)
7645                        goto err_rollback;
7646                res = dev_qdisc_change_tx_queue_len(dev);
7647                if (res)
7648                        goto err_rollback;
7649        }
7650
7651        return 0;
7652
7653err_rollback:
7654        netdev_err(dev, "refused to change device tx_queue_len\n");
7655        dev->tx_queue_len = orig_len;
7656        return res;
7657}
7658
7659/**
7660 *      dev_set_group - Change group this device belongs to
7661 *      @dev: device
7662 *      @new_group: group this device should belong to
7663 */
7664void dev_set_group(struct net_device *dev, int new_group)
7665{
7666        dev->group = new_group;
7667}
7668EXPORT_SYMBOL(dev_set_group);
7669
7670/**
7671 *      dev_set_mac_address - Change Media Access Control Address
7672 *      @dev: device
7673 *      @sa: new address
7674 *
7675 *      Change the hardware (MAC) address of the device
7676 */
7677int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
7678{
7679        const struct net_device_ops *ops = dev->netdev_ops;
7680        int err;
7681
7682        if (!ops->ndo_set_mac_address)
7683                return -EOPNOTSUPP;
7684        if (sa->sa_family != dev->type)
7685                return -EINVAL;
7686        if (!netif_device_present(dev))
7687                return -ENODEV;
7688        err = ops->ndo_set_mac_address(dev, sa);
7689        if (err)
7690                return err;
7691        dev->addr_assign_type = NET_ADDR_SET;
7692        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7693        add_device_randomness(dev->dev_addr, dev->addr_len);
7694        return 0;
7695}
7696EXPORT_SYMBOL(dev_set_mac_address);
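
/*
 * Illustrative sketch, not part of this file: programming a new MAC
 * address through dev_set_mac_address().  The sockaddr family has to
 * match dev->type and the caller holds the RTNL lock; "mac" is assumed
 * to be a dev->addr_len sized buffer and my_set_mac() is hypothetical.
 */
static int my_set_mac(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;

        ASSERT_RTNL();

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, mac, dev->addr_len);

        return dev_set_mac_address(dev, &sa);
}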
7697
7698/**
7699 *      dev_change_carrier - Change device carrier
7700 *      @dev: device
7701 *      @new_carrier: new value
7702 *
7703 *      Change device carrier
7704 */
7705int dev_change_carrier(struct net_device *dev, bool new_carrier)
7706{
7707        const struct net_device_ops *ops = dev->netdev_ops;
7708
7709        if (!ops->ndo_change_carrier)
7710                return -EOPNOTSUPP;
7711        if (!netif_device_present(dev))
7712                return -ENODEV;
7713        return ops->ndo_change_carrier(dev, new_carrier);
7714}
7715EXPORT_SYMBOL(dev_change_carrier);
7716
7717/**
7718 *      dev_get_phys_port_id - Get device physical port ID
7719 *      @dev: device
7720 *      @ppid: port ID
7721 *
7722 *      Get device physical port ID
7723 */
7724int dev_get_phys_port_id(struct net_device *dev,
7725                         struct netdev_phys_item_id *ppid)
7726{
7727        const struct net_device_ops *ops = dev->netdev_ops;
7728
7729        if (!ops->ndo_get_phys_port_id)
7730                return -EOPNOTSUPP;
7731        return ops->ndo_get_phys_port_id(dev, ppid);
7732}
7733EXPORT_SYMBOL(dev_get_phys_port_id);
7734
7735/**
7736 *      dev_get_phys_port_name - Get device physical port name
7737 *      @dev: device
7738 *      @name: port name
7739 *      @len: limit of bytes to copy to name
7740 *
7741 *      Get device physical port name
7742 */
7743int dev_get_phys_port_name(struct net_device *dev,
7744                           char *name, size_t len)
7745{
7746        const struct net_device_ops *ops = dev->netdev_ops;
7747
7748        if (!ops->ndo_get_phys_port_name)
7749                return -EOPNOTSUPP;
7750        return ops->ndo_get_phys_port_name(dev, name, len);
7751}
7752EXPORT_SYMBOL(dev_get_phys_port_name);
7753
7754/**
7755 *      dev_change_proto_down - update protocol port state information
7756 *      @dev: device
7757 *      @proto_down: new value
7758 *
7759 *      This info can be used by switch drivers to set the phys state of the
7760 *      port.
7761 */
7762int dev_change_proto_down(struct net_device *dev, bool proto_down)
7763{
7764        const struct net_device_ops *ops = dev->netdev_ops;
7765
7766        if (!ops->ndo_change_proto_down)
7767                return -EOPNOTSUPP;
7768        if (!netif_device_present(dev))
7769                return -ENODEV;
7770        return ops->ndo_change_proto_down(dev, proto_down);
7771}
7772EXPORT_SYMBOL(dev_change_proto_down);
7773
7774u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
7775                    enum bpf_netdev_command cmd)
7776{
7777        struct netdev_bpf xdp;
7778
7779        if (!bpf_op)
7780                return 0;
7781
7782        memset(&xdp, 0, sizeof(xdp));
7783        xdp.command = cmd;
7784
7785        /* Query must always succeed. */
7786        WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
7787
7788        return xdp.prog_id;
7789}
7790
7791static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
7792                           struct netlink_ext_ack *extack, u32 flags,
7793                           struct bpf_prog *prog)
7794{
7795        struct netdev_bpf xdp;
7796
7797        memset(&xdp, 0, sizeof(xdp));
7798        if (flags & XDP_FLAGS_HW_MODE)
7799                xdp.command = XDP_SETUP_PROG_HW;
7800        else
7801                xdp.command = XDP_SETUP_PROG;
7802        xdp.extack = extack;
7803        xdp.flags = flags;
7804        xdp.prog = prog;
7805
7806        return bpf_op(dev, &xdp);
7807}
7808
7809static void dev_xdp_uninstall(struct net_device *dev)
7810{
7811        struct netdev_bpf xdp;
7812        bpf_op_t ndo_bpf;
7813
7814        /* Remove generic XDP */
7815        WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
7816
7817        /* Remove from the driver */
7818        ndo_bpf = dev->netdev_ops->ndo_bpf;
7819        if (!ndo_bpf)
7820                return;
7821
7822        memset(&xdp, 0, sizeof(xdp));
7823        xdp.command = XDP_QUERY_PROG;
7824        WARN_ON(ndo_bpf(dev, &xdp));
7825        if (xdp.prog_id)
7826                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
7827                                        NULL));
7828
7829        /* Remove HW offload */
7830        memset(&xdp, 0, sizeof(xdp));
7831        xdp.command = XDP_QUERY_PROG_HW;
7832        if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
7833                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
7834                                        NULL));
7835}
7836
7837/**
7838 *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
7839 *      @dev: device
7840 *      @extack: netlink extended ack
7841 *      @fd: new program fd or negative value to clear
7842 *      @flags: xdp-related flags
7843 *
7844 *      Set or clear a bpf program for a device
7845 */
7846int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
7847                      int fd, u32 flags)
7848{
7849        const struct net_device_ops *ops = dev->netdev_ops;
7850        enum bpf_netdev_command query;
7851        struct bpf_prog *prog = NULL;
7852        bpf_op_t bpf_op, bpf_chk;
7853        int err;
7854
7855        ASSERT_RTNL();
7856
7857        query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
7858
7859        bpf_op = bpf_chk = ops->ndo_bpf;
7860        if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
7861                return -EOPNOTSUPP;
7862        if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
7863                bpf_op = generic_xdp_install;
7864        if (bpf_op == bpf_chk)
7865                bpf_chk = generic_xdp_install;
7866
7867        if (fd >= 0) {
7868                if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
7869                    __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
7870                        return -EEXIST;
7871                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
7872                    __dev_xdp_query(dev, bpf_op, query))
7873                        return -EBUSY;
7874
7875                prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
7876                                             bpf_op == ops->ndo_bpf);
7877                if (IS_ERR(prog))
7878                        return PTR_ERR(prog);
7879
7880                if (!(flags & XDP_FLAGS_HW_MODE) &&
7881                    bpf_prog_is_dev_bound(prog->aux)) {
7882                        NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
7883                        bpf_prog_put(prog);
7884                        return -EINVAL;
7885                }
7886        }
7887
7888        err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
7889        if (err < 0 && prog)
7890                bpf_prog_put(prog);
7891
7892        return err;
7893}
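
/* Illustrative sketch, not part of this file: dev_change_xdp_fd() is normally
 * reached via rtnetlink, but a caller holding the RTNL could attach a BPF
 * program by file descriptor along these lines.  "example_attach_xdp" and
 * "prog_fd" are hypothetical names.
 */
static int example_attach_xdp(struct net_device *dev, int prog_fd)
{
        int err;

        rtnl_lock();
        /* NULL extack is accepted; XDP_FLAGS_DRV_MODE requests native mode */
        err = dev_change_xdp_fd(dev, NULL, prog_fd, XDP_FLAGS_DRV_MODE);
        rtnl_unlock();
        return err;
}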
7894
7895/**
7896 *      dev_new_index   -       allocate an ifindex
7897 *      @net: the applicable net namespace
7898 *
7899 *      Returns a suitable unique value for a new device interface
7900 *      number.  The caller must hold the rtnl semaphore or the
7901 *      dev_base_lock to be sure it remains unique.
7902 */
7903static int dev_new_index(struct net *net)
7904{
7905        int ifindex = net->ifindex;
7906
7907        for (;;) {
7908                if (++ifindex <= 0)
7909                        ifindex = 1;
7910                if (!__dev_get_by_index(net, ifindex))
7911                        return net->ifindex = ifindex;
7912        }
7913}
7914
7915/* Delayed registration/unregistration */
7916static LIST_HEAD(net_todo_list);
7917DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
7918
7919static void net_set_todo(struct net_device *dev)
7920{
7921        list_add_tail(&dev->todo_list, &net_todo_list);
7922        dev_net(dev)->dev_unreg_count++;
7923}
7924
7925static void rollback_registered_many(struct list_head *head)
7926{
7927        struct net_device *dev, *tmp;
7928        LIST_HEAD(close_head);
7929
7930        BUG_ON(dev_boot_phase);
7931        ASSERT_RTNL();
7932
7933        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
7934                /* Some devices get here without ever having been
7935                 * registered, as part of unwinding a failed init.
7936                 * Remove those devices and proceed with the remaining.
7937                 */
7938                if (dev->reg_state == NETREG_UNINITIALIZED) {
7939                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
7940                                 dev->name, dev);
7941
7942                        WARN_ON(1);
7943                        list_del(&dev->unreg_list);
7944                        continue;
7945                }
7946                dev->dismantle = true;
7947                BUG_ON(dev->reg_state != NETREG_REGISTERED);
7948        }
7949
7950        /* If device is running, close it first. */
7951        list_for_each_entry(dev, head, unreg_list)
7952                list_add_tail(&dev->close_list, &close_head);
7953        dev_close_many(&close_head, true);
7954
7955        list_for_each_entry(dev, head, unreg_list) {
7956                /* And unlink it from device chain. */
7957                unlist_netdevice(dev);
7958
7959                dev->reg_state = NETREG_UNREGISTERING;
7960        }
7961        flush_all_backlogs();
7962
7963        synchronize_net();
7964
7965        list_for_each_entry(dev, head, unreg_list) {
7966                struct sk_buff *skb = NULL;
7967
7968                /* Shutdown queueing discipline. */
7969                dev_shutdown(dev);
7970
7971                dev_xdp_uninstall(dev);
7972
7973                /* Notify protocols, that we are about to destroy
7974                 * this device. They should clean all the things.
7975                 */
7976                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7977
7978                if (!dev->rtnl_link_ops ||
7979                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7980                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
7981                                                     GFP_KERNEL, NULL, 0);
7982
7983                /*
7984                 *      Flush the unicast and multicast chains
7985                 */
7986                dev_uc_flush(dev);
7987                dev_mc_flush(dev);
7988
7989                if (dev->netdev_ops->ndo_uninit)
7990                        dev->netdev_ops->ndo_uninit(dev);
7991
7992                if (skb)
7993                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
7994
7995                /* Notifier chain MUST detach us from all upper devices. */
7996                WARN_ON(netdev_has_any_upper_dev(dev));
7997                WARN_ON(netdev_has_any_lower_dev(dev));
7998
7999                /* Remove entries from kobject tree */
8000                netdev_unregister_kobject(dev);
8001#ifdef CONFIG_XPS
8002                /* Remove XPS queueing entries */
8003                netif_reset_xps_queues_gt(dev, 0);
8004#endif
8005        }
8006
8007        synchronize_net();
8008
8009        list_for_each_entry(dev, head, unreg_list)
8010                dev_put(dev);
8011}
8012
8013static void rollback_registered(struct net_device *dev)
8014{
8015        LIST_HEAD(single);
8016
8017        list_add(&dev->unreg_list, &single);
8018        rollback_registered_many(&single);
8019        list_del(&single);
8020}
8021
8022static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
8023        struct net_device *upper, netdev_features_t features)
8024{
8025        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8026        netdev_features_t feature;
8027        int feature_bit;
8028
8029        for_each_netdev_feature(&upper_disables, feature_bit) {
8030                feature = __NETIF_F_BIT(feature_bit);
8031                if (!(upper->wanted_features & feature)
8032                    && (features & feature)) {
8033                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
8034                                   &feature, upper->name);
8035                        features &= ~feature;
8036                }
8037        }
8038
8039        return features;
8040}
8041
8042static void netdev_sync_lower_features(struct net_device *upper,
8043        struct net_device *lower, netdev_features_t features)
8044{
8045        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8046        netdev_features_t feature;
8047        int feature_bit;
8048
8049        for_each_netdev_feature(&upper_disables, feature_bit) {
8050                feature = __NETIF_F_BIT(feature_bit);
8051                if (!(features & feature) && (lower->features & feature)) {
8052                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
8053                                   &feature, lower->name);
8054                        lower->wanted_features &= ~feature;
8055                        netdev_update_features(lower);
8056
8057                        if (unlikely(lower->features & feature))
8058                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
8059                                            &feature, lower->name);
8060                }
8061        }
8062}
8063
8064static netdev_features_t netdev_fix_features(struct net_device *dev,
8065        netdev_features_t features)
8066{
8067        /* Fix illegal checksum combinations */
8068        if ((features & NETIF_F_HW_CSUM) &&
8069            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
8070                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
8071                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
8072        }
8073
8074        /* TSO requires that SG is present as well. */
8075        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
8076                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
8077                features &= ~NETIF_F_ALL_TSO;
8078        }
8079
8080        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
8081                                        !(features & NETIF_F_IP_CSUM)) {
8082                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
8083                features &= ~NETIF_F_TSO;
8084                features &= ~NETIF_F_TSO_ECN;
8085        }
8086
8087        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
8088                                         !(features & NETIF_F_IPV6_CSUM)) {
8089                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
8090                features &= ~NETIF_F_TSO6;
8091        }
8092
8093        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
8094        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
8095                features &= ~NETIF_F_TSO_MANGLEID;
8096
8097        /* TSO ECN requires that TSO is present as well. */
8098        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
8099                features &= ~NETIF_F_TSO_ECN;
8100
8101        /* Software GSO depends on SG. */
8102        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
8103                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
8104                features &= ~NETIF_F_GSO;
8105        }
8106
8107        /* GSO partial features require GSO partial be set */
8108        if ((features & dev->gso_partial_features) &&
8109            !(features & NETIF_F_GSO_PARTIAL)) {
8110                netdev_dbg(dev,
8111                           "Dropping partially supported GSO features since no GSO partial.\n");
8112                features &= ~dev->gso_partial_features;
8113        }
8114
8115        if (!(features & NETIF_F_RXCSUM)) {
8116                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
8117                 * successfully merged by hardware must also have the
8118                 * checksum verified by hardware.  If the user does not
8119                 * want to enable RXCSUM, logically, we should disable GRO_HW.
8120                 */
8121                if (features & NETIF_F_GRO_HW) {
8122                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
8123                        features &= ~NETIF_F_GRO_HW;
8124                }
8125        }
8126
8127        /* LRO/HW-GRO features cannot be combined with RX-FCS */
8128        if (features & NETIF_F_RXFCS) {
8129                if (features & NETIF_F_LRO) {
8130                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
8131                        features &= ~NETIF_F_LRO;
8132                }
8133
8134                if (features & NETIF_F_GRO_HW) {
8135                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
8136                        features &= ~NETIF_F_GRO_HW;
8137                }
8138        }
8139
8140        return features;
8141}
8142
8143int __netdev_update_features(struct net_device *dev)
8144{
8145        struct net_device *upper, *lower;
8146        netdev_features_t features;
8147        struct list_head *iter;
8148        int err = -1;
8149
8150        ASSERT_RTNL();
8151
8152        features = netdev_get_wanted_features(dev);
8153
8154        if (dev->netdev_ops->ndo_fix_features)
8155                features = dev->netdev_ops->ndo_fix_features(dev, features);
8156
8157        /* driver might be less strict about feature dependencies */
8158        features = netdev_fix_features(dev, features);
8159
8160        /* some features can't be enabled if they're off on an upper device */
8161        netdev_for_each_upper_dev_rcu(dev, upper, iter)
8162                features = netdev_sync_upper_features(dev, upper, features);
8163
8164        if (dev->features == features)
8165                goto sync_lower;
8166
8167        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
8168                &dev->features, &features);
8169
8170        if (dev->netdev_ops->ndo_set_features)
8171                err = dev->netdev_ops->ndo_set_features(dev, features);
8172        else
8173                err = 0;
8174
8175        if (unlikely(err < 0)) {
8176                netdev_err(dev,
8177                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
8178                        err, &features, &dev->features);
8179                /* return non-0 since some features might have changed and
8180                 * it's better to fire a spurious notification than miss it
8181                 */
8182                return -1;
8183        }
8184
8185sync_lower:
8186        /* some features must be disabled on lower devices when disabled
8187         * on an upper device (think: bonding master or bridge)
8188         */
8189        netdev_for_each_lower_dev(dev, lower, iter)
8190                netdev_sync_lower_features(dev, lower, features);
8191
8192        if (!err) {
8193                netdev_features_t diff = features ^ dev->features;
8194
8195                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
8196                        /* udp_tunnel_{get,drop}_rx_info both need
8197                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
8198                         * device, or they won't do anything.
8199                         * Thus we need to update dev->features
8200                         * *before* calling udp_tunnel_get_rx_info,
8201                         * but *after* calling udp_tunnel_drop_rx_info.
8202                         */
8203                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
8204                                dev->features = features;
8205                                udp_tunnel_get_rx_info(dev);
8206                        } else {
8207                                udp_tunnel_drop_rx_info(dev);
8208                        }
8209                }
8210
8211                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
8212                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
8213                                dev->features = features;
8214                                err |= vlan_get_rx_ctag_filter_info(dev);
8215                        } else {
8216                                vlan_drop_rx_ctag_filter_info(dev);
8217                        }
8218                }
8219
8220                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
8221                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
8222                                dev->features = features;
8223                                err |= vlan_get_rx_stag_filter_info(dev);
8224                        } else {
8225                                vlan_drop_rx_stag_filter_info(dev);
8226                        }
8227                }
8228
8229                dev->features = features;
8230        }
8231
8232        return err < 0 ? 0 : 1;
8233}
8234
8235/**
8236 *      netdev_update_features - recalculate device features
8237 *      @dev: the device to check
8238 *
8239 *      Recalculate dev->features set and send notifications if it
8240 *      has changed. Should be called after driver or hardware dependent
8241 *      conditions might have changed that influence the features.
8242 */
8243void netdev_update_features(struct net_device *dev)
8244{
8245        if (__netdev_update_features(dev))
8246                netdev_features_change(dev);
8247}
8248EXPORT_SYMBOL(netdev_update_features);
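
/* Illustrative sketch, not part of this file: a driver that wants to turn a
 * feature off at runtime can clear it from wanted_features and ask the core
 * to recompute dev->features, mirroring what netdev_sync_lower_features()
 * does above.  The helper name is hypothetical; the RTNL must be held.
 */
static void example_drop_tso(struct net_device *dev)
{
        ASSERT_RTNL();

        dev->wanted_features &= ~NETIF_F_TSO;
        netdev_update_features(dev);
}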
8249
8250/**
8251 *      netdev_change_features - recalculate device features
8252 *      @dev: the device to check
8253 *
8254 *      Recalculate dev->features set and send notifications even
8255 *      if they have not changed. Should be called instead of
8256 *      netdev_update_features() if also dev->vlan_features might
8257 *      have changed to allow the changes to be propagated to stacked
8258 *      VLAN devices.
8259 */
8260void netdev_change_features(struct net_device *dev)
8261{
8262        __netdev_update_features(dev);
8263        netdev_features_change(dev);
8264}
8265EXPORT_SYMBOL(netdev_change_features);
8266
8267/**
8268 *      netif_stacked_transfer_operstate -      transfer operstate
8269 *      @rootdev: the root or lower level device to transfer state from
8270 *      @dev: the device to transfer operstate to
8271 *
8272 *      Transfer operational state from root to device. This is normally
8273 *      called when a stacking relationship exists between the root
8274 *      device and the device (a leaf device).
8275 */
8276void netif_stacked_transfer_operstate(const struct net_device *rootdev,
8277                                        struct net_device *dev)
8278{
8279        if (rootdev->operstate == IF_OPER_DORMANT)
8280                netif_dormant_on(dev);
8281        else
8282                netif_dormant_off(dev);
8283
8284        if (netif_carrier_ok(rootdev))
8285                netif_carrier_on(dev);
8286        else
8287                netif_carrier_off(dev);
8288}
8289EXPORT_SYMBOL(netif_stacked_transfer_operstate);
8290
8291static int netif_alloc_rx_queues(struct net_device *dev)
8292{
8293        unsigned int i, count = dev->num_rx_queues;
8294        struct netdev_rx_queue *rx;
8295        size_t sz = count * sizeof(*rx);
8296        int err = 0;
8297
8298        BUG_ON(count < 1);
8299
8300        rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8301        if (!rx)
8302                return -ENOMEM;
8303
8304        dev->_rx = rx;
8305
8306        for (i = 0; i < count; i++) {
8307                rx[i].dev = dev;
8308
8309                /* XDP RX-queue setup */
8310                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
8311                if (err < 0)
8312                        goto err_rxq_info;
8313        }
8314        return 0;
8315
8316err_rxq_info:
8317        /* Rollback successful reg's and free other resources */
8318        while (i--)
8319                xdp_rxq_info_unreg(&rx[i].xdp_rxq);
8320        kvfree(dev->_rx);
8321        dev->_rx = NULL;
8322        return err;
8323}
8324
8325static void netif_free_rx_queues(struct net_device *dev)
8326{
8327        unsigned int i, count = dev->num_rx_queues;
8328
8329        /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
8330        if (!dev->_rx)
8331                return;
8332
8333        for (i = 0; i < count; i++)
8334                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
8335
8336        kvfree(dev->_rx);
8337}
8338
8339static void netdev_init_one_queue(struct net_device *dev,
8340                                  struct netdev_queue *queue, void *_unused)
8341{
8342        /* Initialize queue lock */
8343        spin_lock_init(&queue->_xmit_lock);
8344        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
8345        queue->xmit_lock_owner = -1;
8346        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
8347        queue->dev = dev;
8348#ifdef CONFIG_BQL
8349        dql_init(&queue->dql, HZ);
8350#endif
8351}
8352
8353static void netif_free_tx_queues(struct net_device *dev)
8354{
8355        kvfree(dev->_tx);
8356}
8357
8358static int netif_alloc_netdev_queues(struct net_device *dev)
8359{
8360        unsigned int count = dev->num_tx_queues;
8361        struct netdev_queue *tx;
8362        size_t sz = count * sizeof(*tx);
8363
8364        if (count < 1 || count > 0xffff)
8365                return -EINVAL;
8366
8367        tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8368        if (!tx)
8369                return -ENOMEM;
8370
8371        dev->_tx = tx;
8372
8373        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
8374        spin_lock_init(&dev->tx_global_lock);
8375
8376        return 0;
8377}
8378
8379void netif_tx_stop_all_queues(struct net_device *dev)
8380{
8381        unsigned int i;
8382
8383        for (i = 0; i < dev->num_tx_queues; i++) {
8384                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
8385
8386                netif_tx_stop_queue(txq);
8387        }
8388}
8389EXPORT_SYMBOL(netif_tx_stop_all_queues);
8390
8391/**
8392 *      register_netdevice      - register a network device
8393 *      @dev: device to register
8394 *
8395 *      Take a completed network device structure and add it to the kernel
8396 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8397 *      chain. 0 is returned on success. A negative errno code is returned
8398 *      on a failure to set up the device, or if the name is a duplicate.
8399 *
8400 *      Callers must hold the rtnl semaphore. You may want
8401 *      register_netdev() instead of this.
8402 *
8403 *      BUGS:
8404 *      The locking appears insufficient to guarantee two parallel registers
8405 *      will not get the same name.
8406 */
8407
8408int register_netdevice(struct net_device *dev)
8409{
8410        int ret;
8411        struct net *net = dev_net(dev);
8412
8413        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
8414                     NETDEV_FEATURE_COUNT);
8415        BUG_ON(dev_boot_phase);
8416        ASSERT_RTNL();
8417
8418        might_sleep();
8419
8420        /* When net_devices are persistent, this will be fatal. */
8421        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
8422        BUG_ON(!net);
8423
8424        spin_lock_init(&dev->addr_list_lock);
8425        netdev_set_addr_lockdep_class(dev);
8426
8427        ret = dev_get_valid_name(net, dev, dev->name);
8428        if (ret < 0)
8429                goto out;
8430
8431        /* Init, if this function is available */
8432        if (dev->netdev_ops->ndo_init) {
8433                ret = dev->netdev_ops->ndo_init(dev);
8434                if (ret) {
8435                        if (ret > 0)
8436                                ret = -EIO;
8437                        goto out;
8438                }
8439        }
8440
8441        if (((dev->hw_features | dev->features) &
8442             NETIF_F_HW_VLAN_CTAG_FILTER) &&
8443            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
8444             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
8445                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
8446                ret = -EINVAL;
8447                goto err_uninit;
8448        }
8449
8450        ret = -EBUSY;
8451        if (!dev->ifindex)
8452                dev->ifindex = dev_new_index(net);
8453        else if (__dev_get_by_index(net, dev->ifindex))
8454                goto err_uninit;
8455
8456        /* Transfer changeable features to wanted_features and enable
8457         * software offloads (GSO and GRO).
8458         */
8459        dev->hw_features |= NETIF_F_SOFT_FEATURES;
8460        dev->features |= NETIF_F_SOFT_FEATURES;
8461
8462        if (dev->netdev_ops->ndo_udp_tunnel_add) {
8463                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
8464                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
8465        }
8466
8467        dev->wanted_features = dev->features & dev->hw_features;
8468
8469        if (!(dev->flags & IFF_LOOPBACK))
8470                dev->hw_features |= NETIF_F_NOCACHE_COPY;
8471
8472        /* If IPv4 TCP segmentation offload is supported we should also
8473         * allow the device to enable segmenting the frame with the option
8474         * of ignoring a static IP ID value.  This doesn't enable the
8475         * feature itself but allows the user to enable it later.
8476         */
8477        if (dev->hw_features & NETIF_F_TSO)
8478                dev->hw_features |= NETIF_F_TSO_MANGLEID;
8479        if (dev->vlan_features & NETIF_F_TSO)
8480                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
8481        if (dev->mpls_features & NETIF_F_TSO)
8482                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
8483        if (dev->hw_enc_features & NETIF_F_TSO)
8484                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
8485
8486        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
8487         */
8488        dev->vlan_features |= NETIF_F_HIGHDMA;
8489
8490        /* Make NETIF_F_SG inheritable to tunnel devices.
8491         */
8492        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
8493
8494        /* Make NETIF_F_SG inheritable to MPLS.
8495         */
8496        dev->mpls_features |= NETIF_F_SG;
8497
8498        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
8499        ret = notifier_to_errno(ret);
8500        if (ret)
8501                goto err_uninit;
8502
8503        ret = netdev_register_kobject(dev);
8504        if (ret)
8505                goto err_uninit;
8506        dev->reg_state = NETREG_REGISTERED;
8507
8508        __netdev_update_features(dev);
8509
8510        /*
8511         *      Default initial state at registration is that the
8512         *      device is present.
8513         */
8514
8515        set_bit(__LINK_STATE_PRESENT, &dev->state);
8516
8517        linkwatch_init_dev(dev);
8518
8519        dev_init_scheduler(dev);
8520        dev_hold(dev);
8521        list_netdevice(dev);
8522        add_device_randomness(dev->dev_addr, dev->addr_len);
8523
8524        /* If the device has a permanent device address, the driver
8525         * should set dev_addr and also set addr_assign_type to
8526         * NET_ADDR_PERM (the default value).
8527         */
8528        if (dev->addr_assign_type == NET_ADDR_PERM)
8529                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
8530
8531        /* Notify protocols, that a new device appeared. */
8532        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
8533        ret = notifier_to_errno(ret);
8534        if (ret) {
8535                rollback_registered(dev);
8536                dev->reg_state = NETREG_UNREGISTERED;
8537        }
8538        /*
8539         *      Prevent userspace races by waiting until the network
8540         *      device is fully set up before sending notifications.
8541         */
8542        if (!dev->rtnl_link_ops ||
8543            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8544                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8545
8546out:
8547        return ret;
8548
8549err_uninit:
8550        if (dev->netdev_ops->ndo_uninit)
8551                dev->netdev_ops->ndo_uninit(dev);
8552        if (dev->priv_destructor)
8553                dev->priv_destructor(dev);
8554        goto out;
8555}
8556EXPORT_SYMBOL(register_netdevice);
8557
8558/**
8559 *      init_dummy_netdev       - init a dummy network device for NAPI
8560 *      @dev: device to init
8561 *
8562 *      This takes a network device structure and initializes the minimum
8563 *      amount of fields so it can be used to schedule NAPI polls without
8564 *      registering a full blown interface. This is to be used by drivers
8565 *      that need to tie several hardware interfaces to a single NAPI
8566 *      poll scheduler due to HW limitations.
8567 */
8568int init_dummy_netdev(struct net_device *dev)
8569{
8570        /* Clear everything. Note we don't initialize spinlocks
8571         * as they aren't supposed to be taken by any of the
8572         * NAPI code and this dummy netdev is supposed to be
8573         * only ever used for NAPI polls
8574         */
8575        memset(dev, 0, sizeof(struct net_device));
8576
8577        /* make sure we BUG if trying to hit standard
8578         * register/unregister code path
8579         */
8580        dev->reg_state = NETREG_DUMMY;
8581
8582        /* NAPI wants this */
8583        INIT_LIST_HEAD(&dev->napi_list);
8584
8585        /* a dummy interface is started by default */
8586        set_bit(__LINK_STATE_PRESENT, &dev->state);
8587        set_bit(__LINK_STATE_START, &dev->state);
8588
8589        /* Note: We don't allocate pcpu_refcnt for dummy devices,
8590         * because users of this 'device' don't need to change
8591         * its refcount.
8592         */
8593
8594        return 0;
8595}
8596EXPORT_SYMBOL_GPL(init_dummy_netdev);
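
/* Illustrative sketch, not part of this file: the intended use of
 * init_dummy_netdev() is a driver that funnels several hardware channels
 * into one NAPI context.  "example_hw" and "example_poll" are hypothetical.
 */
struct example_hw {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        int work = 0;           /* hypothetical: process up to budget packets */

        if (work < budget)
                napi_complete_done(napi, work);
        return work;
}

static void example_init_napi(struct example_hw *hw)
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, example_poll, NAPI_POLL_WEIGHT);
        napi_enable(&hw->napi);
}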
8597
8598
8599/**
8600 *      register_netdev - register a network device
8601 *      @dev: device to register
8602 *
8603 *      Take a completed network device structure and add it to the kernel
8604 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8605 *      chain. 0 is returned on success. A negative errno code is returned
8606 *      on a failure to set up the device, or if the name is a duplicate.
8607 *
8608 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
8609 *      and expands the device name if you passed a format string to
8610 *      alloc_netdev.
8611 */
8612int register_netdev(struct net_device *dev)
8613{
8614        int err;
8615
8616        if (rtnl_lock_killable())
8617                return -EINTR;
8618        err = register_netdevice(dev);
8619        rtnl_unlock();
8620        return err;
8621}
8622EXPORT_SYMBOL(register_netdev);
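
/* Illustrative sketch, not part of this file: the common probe-time pattern
 * of allocating an Ethernet device and registering it.  "example_priv" and
 * "example_netdev_ops" are hypothetical driver-side names; a real driver
 * fills in its own ndo callbacks.
 */
struct example_priv {
        void __iomem *regs;             /* hypothetical hardware state */
};

static const struct net_device_ops example_netdev_ops;

static int example_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        dev->netdev_ops = &example_netdev_ops;
        eth_hw_addr_random(dev);

        err = register_netdev(dev);     /* takes the RTNL internally */
        if (err)
                free_netdev(dev);       /* never registered, plain free is fine */
        return err;
}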
8623
8624int netdev_refcnt_read(const struct net_device *dev)
8625{
8626        int i, refcnt = 0;
8627
8628        for_each_possible_cpu(i)
8629                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
8630        return refcnt;
8631}
8632EXPORT_SYMBOL(netdev_refcnt_read);
8633
8634/**
8635 * netdev_wait_allrefs - wait until all references are gone.
8636 * @dev: target net_device
8637 *
8638 * This is called when unregistering network devices.
8639 *
8640 * Any protocol or device that holds a reference should register
8641 * for netdevice notification, and cleanup and put back the
8642 * reference if they receive an UNREGISTER event.
8643 * We can get stuck here if buggy protocols don't correctly
8644 * call dev_put.
8645 */
8646static void netdev_wait_allrefs(struct net_device *dev)
8647{
8648        unsigned long rebroadcast_time, warning_time;
8649        int refcnt;
8650
8651        linkwatch_forget_dev(dev);
8652
8653        rebroadcast_time = warning_time = jiffies;
8654        refcnt = netdev_refcnt_read(dev);
8655
8656        while (refcnt != 0) {
8657                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
8658                        rtnl_lock();
8659
8660                        /* Rebroadcast unregister notification */
8661                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8662
8663                        __rtnl_unlock();
8664                        rcu_barrier();
8665                        rtnl_lock();
8666
8667                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
8668                                     &dev->state)) {
8669                                /* We must not have linkwatch events
8670                                 * pending on unregister. If this
8671                                 * happens, we simply run the queue
8672                                 * unscheduled, resulting in a noop
8673                                 * for this device.
8674                                 */
8675                                linkwatch_run_queue();
8676                        }
8677
8678                        __rtnl_unlock();
8679
8680                        rebroadcast_time = jiffies;
8681                }
8682
8683                msleep(250);
8684
8685                refcnt = netdev_refcnt_read(dev);
8686
8687                if (time_after(jiffies, warning_time + 10 * HZ)) {
8688                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
8689                                 dev->name, refcnt);
8690                        warning_time = jiffies;
8691                }
8692        }
8693}
8694
8695/* The sequence is:
8696 *
8697 *      rtnl_lock();
8698 *      ...
8699 *      register_netdevice(x1);
8700 *      register_netdevice(x2);
8701 *      ...
8702 *      unregister_netdevice(y1);
8703 *      unregister_netdevice(y2);
8704 *      ...
8705 *      rtnl_unlock();
8706 *      free_netdev(y1);
8707 *      free_netdev(y2);
8708 *
8709 * We are invoked by rtnl_unlock().
8710 * This allows us to deal with problems:
8711 * 1) We can delete sysfs objects which invoke hotplug
8712 *    without deadlocking with linkwatch via keventd.
8713 * 2) Since we run with the RTNL semaphore not held, we can sleep
8714 *    safely in order to wait for the netdev refcnt to drop to zero.
8715 *
8716 * We must not return until all unregister events added during
8717 * the interval the lock was held have been completed.
8718 */
8719void netdev_run_todo(void)
8720{
8721        struct list_head list;
8722
8723        /* Snapshot list, allow later requests */
8724        list_replace_init(&net_todo_list, &list);
8725
8726        __rtnl_unlock();
8727
8728
8729        /* Wait for rcu callbacks to finish before next phase */
8730        if (!list_empty(&list))
8731                rcu_barrier();
8732
8733        while (!list_empty(&list)) {
8734                struct net_device *dev
8735                        = list_first_entry(&list, struct net_device, todo_list);
8736                list_del(&dev->todo_list);
8737
8738                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
8739                        pr_err("network todo '%s' but state %d\n",
8740                               dev->name, dev->reg_state);
8741                        dump_stack();
8742                        continue;
8743                }
8744
8745                dev->reg_state = NETREG_UNREGISTERED;
8746
8747                netdev_wait_allrefs(dev);
8748
8749                /* paranoia */
8750                BUG_ON(netdev_refcnt_read(dev));
8751                BUG_ON(!list_empty(&dev->ptype_all));
8752                BUG_ON(!list_empty(&dev->ptype_specific));
8753                WARN_ON(rcu_access_pointer(dev->ip_ptr));
8754                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
8755#if IS_ENABLED(CONFIG_DECNET)
8756                WARN_ON(dev->dn_ptr);
8757#endif
8758                if (dev->priv_destructor)
8759                        dev->priv_destructor(dev);
8760                if (dev->needs_free_netdev)
8761                        free_netdev(dev);
8762
8763                /* Report a network device has been unregistered */
8764                rtnl_lock();
8765                dev_net(dev)->dev_unreg_count--;
8766                __rtnl_unlock();
8767                wake_up(&netdev_unregistering_wq);
8768
8769                /* Free network device */
8770                kobject_put(&dev->dev.kobj);
8771        }
8772}
8773
8774/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
8775 * all the same fields in the same order as net_device_stats, with only
8776 * the type differing, but rtnl_link_stats64 may have additional fields
8777 * at the end for newer counters.
8778 */
8779void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
8780                             const struct net_device_stats *netdev_stats)
8781{
8782#if BITS_PER_LONG == 64
8783        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
8784        memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
8785        /* zero out counters that only exist in rtnl_link_stats64 */
8786        memset((char *)stats64 + sizeof(*netdev_stats), 0,
8787               sizeof(*stats64) - sizeof(*netdev_stats));
8788#else
8789        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
8790        const unsigned long *src = (const unsigned long *)netdev_stats;
8791        u64 *dst = (u64 *)stats64;
8792
8793        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
8794        for (i = 0; i < n; i++)
8795                dst[i] = src[i];
8796        /* zero out counters that only exist in rtnl_link_stats64 */
8797        memset((char *)stats64 + n * sizeof(u64), 0,
8798               sizeof(*stats64) - n * sizeof(u64));
8799#endif
8800}
8801EXPORT_SYMBOL(netdev_stats_to_stats64);
8802
8803/**
8804 *      dev_get_stats   - get network device statistics
8805 *      @dev: device to get statistics from
8806 *      @storage: place to store stats
8807 *
8808 *      Get network statistics from device. Return @storage.
8809 *      The device driver may provide its own method by setting
8810 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
8811 *      otherwise the internal statistics structure is used.
8812 */
8813struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
8814                                        struct rtnl_link_stats64 *storage)
8815{
8816        const struct net_device_ops *ops = dev->netdev_ops;
8817
8818        if (ops->ndo_get_stats64) {
8819                memset(storage, 0, sizeof(*storage));
8820                ops->ndo_get_stats64(dev, storage);
8821        } else if (ops->ndo_get_stats) {
8822                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
8823        } else {
8824                netdev_stats_to_stats64(storage, &dev->stats);
8825        }
8826        storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
8827        storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
8828        storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
8829        return storage;
8830}
8831EXPORT_SYMBOL(dev_get_stats);
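
/* Illustrative sketch, not part of this file: reading a device's 64-bit
 * counters from some monitoring context that holds a reference on the
 * device.  The helper name is hypothetical.
 */
static void example_log_stats(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        netdev_info(dev, "rx %llu packets, tx %llu packets, rx dropped %llu\n",
                    stats.rx_packets, stats.tx_packets, stats.rx_dropped);
}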
8832
8833struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
8834{
8835        struct netdev_queue *queue = dev_ingress_queue(dev);
8836
8837#ifdef CONFIG_NET_CLS_ACT
8838        if (queue)
8839                return queue;
8840        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
8841        if (!queue)
8842                return NULL;
8843        netdev_init_one_queue(dev, queue, NULL);
8844        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
8845        queue->qdisc_sleeping = &noop_qdisc;
8846        rcu_assign_pointer(dev->ingress_queue, queue);
8847#endif
8848        return queue;
8849}
8850
8851static const struct ethtool_ops default_ethtool_ops;
8852
8853void netdev_set_default_ethtool_ops(struct net_device *dev,
8854                                    const struct ethtool_ops *ops)
8855{
8856        if (dev->ethtool_ops == &default_ethtool_ops)
8857                dev->ethtool_ops = ops;
8858}
8859EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
8860
8861void netdev_freemem(struct net_device *dev)
8862{
8863        char *addr = (char *)dev - dev->padded;
8864
8865        kvfree(addr);
8866}
8867
8868/**
8869 * alloc_netdev_mqs - allocate network device
8870 * @sizeof_priv: size of private data to allocate space for
8871 * @name: device name format string
8872 * @name_assign_type: origin of device name
8873 * @setup: callback to initialize device
8874 * @txqs: the number of TX subqueues to allocate
8875 * @rxqs: the number of RX subqueues to allocate
8876 *
8877 * Allocates a struct net_device with private data area for driver use
8878 * and performs basic initialization.  Also allocates subqueue structs
8879 * for each queue on the device.
8880 */
8881struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
8882                unsigned char name_assign_type,
8883                void (*setup)(struct net_device *),
8884                unsigned int txqs, unsigned int rxqs)
8885{
8886        struct net_device *dev;
8887        unsigned int alloc_size;
8888        struct net_device *p;
8889
8890        BUG_ON(strlen(name) >= sizeof(dev->name));
8891
8892        if (txqs < 1) {
8893                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
8894                return NULL;
8895        }
8896
8897        if (rxqs < 1) {
8898                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
8899                return NULL;
8900        }
8901
8902        alloc_size = sizeof(struct net_device);
8903        if (sizeof_priv) {
8904                /* ensure 32-byte alignment of private area */
8905                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
8906                alloc_size += sizeof_priv;
8907        }
8908        /* ensure 32-byte alignment of whole construct */
8909        alloc_size += NETDEV_ALIGN - 1;
8910
8911        p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8912        if (!p)
8913                return NULL;
8914
8915        dev = PTR_ALIGN(p, NETDEV_ALIGN);
8916        dev->padded = (char *)dev - (char *)p;
8917
8918        dev->pcpu_refcnt = alloc_percpu(int);
8919        if (!dev->pcpu_refcnt)
8920                goto free_dev;
8921
8922        if (dev_addr_init(dev))
8923                goto free_pcpu;
8924
8925        dev_mc_init(dev);
8926        dev_uc_init(dev);
8927
8928        dev_net_set(dev, &init_net);
8929
8930        dev->gso_max_size = GSO_MAX_SIZE;
8931        dev->gso_max_segs = GSO_MAX_SEGS;
8932
8933        INIT_LIST_HEAD(&dev->napi_list);
8934        INIT_LIST_HEAD(&dev->unreg_list);
8935        INIT_LIST_HEAD(&dev->close_list);
8936        INIT_LIST_HEAD(&dev->link_watch_list);
8937        INIT_LIST_HEAD(&dev->adj_list.upper);
8938        INIT_LIST_HEAD(&dev->adj_list.lower);
8939        INIT_LIST_HEAD(&dev->ptype_all);
8940        INIT_LIST_HEAD(&dev->ptype_specific);
8941#ifdef CONFIG_NET_SCHED
8942        hash_init(dev->qdisc_hash);
8943#endif
8944        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8945        setup(dev);
8946
8947        if (!dev->tx_queue_len) {
8948                dev->priv_flags |= IFF_NO_QUEUE;
8949                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
8950        }
8951
8952        dev->num_tx_queues = txqs;
8953        dev->real_num_tx_queues = txqs;
8954        if (netif_alloc_netdev_queues(dev))
8955                goto free_all;
8956
8957        dev->num_rx_queues = rxqs;
8958        dev->real_num_rx_queues = rxqs;
8959        if (netif_alloc_rx_queues(dev))
8960                goto free_all;
8961
8962        strcpy(dev->name, name);
8963        dev->name_assign_type = name_assign_type;
8964        dev->group = INIT_NETDEV_GROUP;
8965        if (!dev->ethtool_ops)
8966                dev->ethtool_ops = &default_ethtool_ops;
8967
8968        nf_hook_ingress_init(dev);
8969
8970        return dev;
8971
8972free_all:
8973        free_netdev(dev);
8974        return NULL;
8975
8976free_pcpu:
8977        free_percpu(dev->pcpu_refcnt);
8978free_dev:
8979        netdev_freemem(dev);
8980        return NULL;
8981}
8982EXPORT_SYMBOL(alloc_netdev_mqs);
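
/* Illustrative sketch, not part of this file: allocating an 8/8-queue
 * Ethernet-style device with no private area.  "example_setup" is a
 * hypothetical link-type setup callback.
 */
static void example_setup(struct net_device *dev)
{
        ether_setup(dev);               /* sets type, MTU, broadcast addr, ... */
}

static struct net_device *example_alloc(void)
{
        return alloc_netdev_mqs(0, "ex%d", NET_NAME_UNKNOWN,
                                example_setup, 8, 8);
}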
8983
8984/**
8985 * free_netdev - free network device
8986 * @dev: device
8987 *
8988 * This function does the last stage of destroying an allocated device
8989 * interface. The reference to the device object is released. If this
8990 * is the last reference then it will be freed. Must be called in process
8991 * context.
8992 */
8993void free_netdev(struct net_device *dev)
8994{
8995        struct napi_struct *p, *n;
8996
8997        might_sleep();
8998        netif_free_tx_queues(dev);
8999        netif_free_rx_queues(dev);
9000
9001        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
9002
9003        /* Flush device addresses */
9004        dev_addr_flush(dev);
9005
9006        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
9007                netif_napi_del(p);
9008
9009        free_percpu(dev->pcpu_refcnt);
9010        dev->pcpu_refcnt = NULL;
9011
9012        /*  Compatibility with error handling in drivers */
9013        if (dev->reg_state == NETREG_UNINITIALIZED) {
9014                netdev_freemem(dev);
9015                return;
9016        }
9017
9018        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
9019        dev->reg_state = NETREG_RELEASED;
9020
9021        /* will free via device release */
9022        put_device(&dev->dev);
9023}
9024EXPORT_SYMBOL(free_netdev);
9025
9026/**
9027 *      synchronize_net -  Synchronize with packet receive processing
9028 *
9029 *      Wait for packets currently being received to be done.
9030 *      Does not block later packets from starting.
9031 */
9032void synchronize_net(void)
9033{
9034        might_sleep();
9035        if (rtnl_is_locked())
9036                synchronize_rcu_expedited();
9037        else
9038                synchronize_rcu();
9039}
9040EXPORT_SYMBOL(synchronize_net);
9041
9042/**
9043 *      unregister_netdevice_queue - remove device from the kernel
9044 *      @dev: device
9045 *      @head: list
9046 *
9047 *      This function shuts down a device interface and removes it
9048 *      from the kernel tables.
9049 *      If @head is not NULL, the device is queued to be unregistered later.
9050 *
9051 *      Callers must hold the rtnl semaphore.  You may want
9052 *      unregister_netdev() instead of this.
9053 */
9054
9055void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
9056{
9057        ASSERT_RTNL();
9058
9059        if (head) {
9060                list_move_tail(&dev->unreg_list, head);
9061        } else {
9062                rollback_registered(dev);
9063                /* Finish processing unregister after unlock */
9064                net_set_todo(dev);
9065        }
9066}
9067EXPORT_SYMBOL(unregister_netdevice_queue);
9068
9069/**
9070 *      unregister_netdevice_many - unregister many devices
9071 *      @head: list of devices
9072 *
9073 *  Note: As most callers use a stack allocated list_head,
9074 *  we force a list_del() to make sure the stack won't be corrupted later.
9075 */
9076void unregister_netdevice_many(struct list_head *head)
9077{
9078        struct net_device *dev;
9079
9080        if (!list_empty(head)) {
9081                rollback_registered_many(head);
9082                list_for_each_entry(dev, head, unreg_list)
9083                        net_set_todo(dev);
9084                list_del(head);
9085        }
9086}
9087EXPORT_SYMBOL(unregister_netdevice_many);
9088
9089/**
9090 *      unregister_netdev - remove device from the kernel
9091 *      @dev: device
9092 *
9093 *      This function shuts down a device interface and removes it
9094 *      from the kernel tables.
9095 *
9096 *      This is just a wrapper for unregister_netdevice that takes
9097 *      the rtnl semaphore.  In general you want to use this and not
9098 *      unregister_netdevice.
9099 */
9100void unregister_netdev(struct net_device *dev)
9101{
9102        rtnl_lock();
9103        unregister_netdevice(dev);
9104        rtnl_unlock();
9105}
9106EXPORT_SYMBOL(unregister_netdev);
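
/* Illustrative sketch, not part of this file: the usual driver teardown
 * order.  unregister_netdev() takes the RTNL itself; free_netdev() may only
 * run afterwards, once nothing can reach the device any more.
 */
static void example_remove(struct net_device *dev)
{
        unregister_netdev(dev);
        free_netdev(dev);
}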
9107
9108/**
9109 *      dev_change_net_namespace - move device to a different network namespace
9110 *      @dev: device
9111 *      @net: network namespace
9112 *      @pat: If not NULL name pattern to try if the current device name
9113 *            is already taken in the destination network namespace.
9114 *
9115 *      This function shuts down a device interface and moves it
9116 *      to a new network namespace. On success 0 is returned, on
9117 *      a failure a negative errno code is returned.
9118 *
9119 *      Callers must hold the rtnl semaphore.
9120 */
9121
9122int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
9123{
9124        int err, new_nsid, new_ifindex;
9125
9126        ASSERT_RTNL();
9127
9128        /* Don't allow namespace local devices to be moved. */
9129        err = -EINVAL;
9130        if (dev->features & NETIF_F_NETNS_LOCAL)
9131                goto out;
9132
9133        /* Ensure the device has been registered */
9134        if (dev->reg_state != NETREG_REGISTERED)
9135                goto out;
9136
9137        /* Get out if there is nothing to do */
9138        err = 0;
9139        if (net_eq(dev_net(dev), net))
9140                goto out;
9141
9142        /* Pick the destination device name, and ensure
9143         * we can use it in the destination network namespace.
9144         */
9145        err = -EEXIST;
9146        if (__dev_get_by_name(net, dev->name)) {
9147                /* We get here if we can't use the current device name */
9148                if (!pat)
9149                        goto out;
9150                err = dev_get_valid_name(net, dev, pat);
9151                if (err < 0)
9152                        goto out;
9153        }
9154
9155        /*
9156         * And now a mini version of register_netdevice()/unregister_netdevice().
9157         */
9158
9159        /* If device is running close it first. */
9160        dev_close(dev);
9161
9162        /* And unlink it from device chain */
9163        unlist_netdevice(dev);
9164
9165        synchronize_net();
9166
9167        /* Shutdown queueing discipline. */
9168        dev_shutdown(dev);
9169
9170        /* Notify protocols, that we are about to destroy
9171         * this device. They should clean all the things.
9172         *
9173         * Note that dev->reg_state stays at NETREG_REGISTERED.
9174         * This is wanted because this way 8021q and macvlan know
9175         * the device is just moving and can keep their slaves up.
9176         */
9177        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9178        rcu_barrier();
9179
9180        new_nsid = peernet2id_alloc(dev_net(dev), net);
9181        /* If there is an ifindex conflict assign a new one */
9182        if (__dev_get_by_index(net, dev->ifindex))
9183                new_ifindex = dev_new_index(net);
9184        else
9185                new_ifindex = dev->ifindex;
9186
9187        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
9188                            new_ifindex);
9189
9190        /*
9191         *      Flush the unicast and multicast chains
9192         */
9193        dev_uc_flush(dev);
9194        dev_mc_flush(dev);
9195
9196        /* Send a netdev-removed uevent to the old namespace */
9197        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
9198        netdev_adjacent_del_links(dev);
9199
9200        /* Actually switch the network namespace */
9201        dev_net_set(dev, net);
9202        dev->ifindex = new_ifindex;
9203
9204        /* Send a netdev-add uevent to the new namespace */
9205        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
9206        netdev_adjacent_add_links(dev);
9207
9208        /* Fixup kobjects */
9209        err = device_rename(&dev->dev, dev->name);
9210        WARN_ON(err);
9211
9212        /* Add the device back in the hashes */
9213        list_netdevice(dev);
9214
9215        /* Notify protocols, that a new device appeared. */
9216        call_netdevice_notifiers(NETDEV_REGISTER, dev);
9217
9218        /*
9219         *      Prevent userspace races by waiting until the network
9220         *      device is fully set up before sending notifications.
9221         */
9222        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9223
9224        synchronize_net();
9225        err = 0;
9226out:
9227        return err;
9228}
9229EXPORT_SYMBOL_GPL(dev_change_net_namespace);
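
/* Illustrative sketch, not part of this file: moving a device into a
 * namespace identified by an open /proc/<pid>/ns/net (or /run/netns) file
 * descriptor.  The caller is assumed to hold the RTNL, as required above;
 * "ns_fd" is a hypothetical parameter.
 */
static int example_move_to_ns(struct net_device *dev, int ns_fd)
{
        struct net *net;
        int err;

        net = get_net_ns_by_fd(ns_fd);
        if (IS_ERR(net))
                return PTR_ERR(net);

        err = dev_change_net_namespace(dev, net, "eth%d");
        put_net(net);
        return err;
}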
9230
9231static int dev_cpu_dead(unsigned int oldcpu)
9232{
9233        struct sk_buff **list_skb;
9234        struct sk_buff *skb;
9235        unsigned int cpu;
9236        struct softnet_data *sd, *oldsd, *remsd = NULL;
9237
9238        local_irq_disable();
9239        cpu = smp_processor_id();
9240        sd = &per_cpu(softnet_data, cpu);
9241        oldsd = &per_cpu(softnet_data, oldcpu);
9242
9243        /* Find end of our completion_queue. */
9244        list_skb = &sd->completion_queue;
9245        while (*list_skb)
9246                list_skb = &(*list_skb)->next;
9247        /* Append completion queue from offline CPU. */
9248        *list_skb = oldsd->completion_queue;
9249        oldsd->completion_queue = NULL;
9250
9251        /* Append output queue from offline CPU. */
9252        if (oldsd->output_queue) {
9253                *sd->output_queue_tailp = oldsd->output_queue;
9254                sd->output_queue_tailp = oldsd->output_queue_tailp;
9255                oldsd->output_queue = NULL;
9256                oldsd->output_queue_tailp = &oldsd->output_queue;
9257        }
9258        /* Append NAPI poll list from offline CPU, with one exception:
9259         * process_backlog() must be called by the cpu owning the percpu backlog.
9260         * We properly handle process_queue & input_pkt_queue later.
9261         */
9262        while (!list_empty(&oldsd->poll_list)) {
9263                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
9264                                                            struct napi_struct,
9265                                                            poll_list);
9266
9267                list_del_init(&napi->poll_list);
9268                if (napi->poll == process_backlog)
9269                        napi->state = 0;
9270                else
9271                        ____napi_schedule(sd, napi);
9272        }
9273
9274        raise_softirq_irqoff(NET_TX_SOFTIRQ);
9275        local_irq_enable();
9276
9277#ifdef CONFIG_RPS
9278        remsd = oldsd->rps_ipi_list;
9279        oldsd->rps_ipi_list = NULL;
9280#endif
9281        /* send out pending IPI's on offline CPU */
9282        net_rps_send_ipi(remsd);
9283
9284        /* Process the offline CPU's process_queue and input_pkt_queue. */
9285        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
9286                netif_rx_ni(skb);
9287                input_queue_head_incr(oldsd);
9288        }
9289        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
9290                netif_rx_ni(skb);
9291                input_queue_head_incr(oldsd);
9292        }
9293
9294        return 0;
9295}
9296
9297/**
9298 *      netdev_increment_features - increment feature set by one
9299 *      @all: current feature set
9300 *      @one: new feature set
9301 *      @mask: mask feature set
9302 *
9303 *      Computes a new feature set after adding a device with feature set
9304 *      @one to the master device with current feature set @all.  Will not
9305 *      enable anything that is off in @mask. Returns the new feature set.
9306 */
9307netdev_features_t netdev_increment_features(netdev_features_t all,
9308        netdev_features_t one, netdev_features_t mask)
9309{
9310        if (mask & NETIF_F_HW_CSUM)
9311                mask |= NETIF_F_CSUM_MASK;
9312        mask |= NETIF_F_VLAN_CHALLENGED;
9313
9314        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
9315        all &= one | ~NETIF_F_ALL_FOR_ALL;
9316
9317        /* If one device supports hw checksumming, set for all. */
9318        if (all & NETIF_F_HW_CSUM)
9319                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
9320
9321        return all;
9322}
9323EXPORT_SYMBOL(netdev_increment_features);
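
/* Example (illustrative only; "upper", "lower" and "feature_mask" are
 * hypothetical names): an aggregating driver such as a bridge or bond
 * folds the features of each lower device into its own set roughly as:
 *
 *	netdev_features_t features = upper->feature_mask;
 *
 *	list_for_each_entry(lower, &upper->lower_list, list)
 *		features = netdev_increment_features(features,
 *						     lower->dev->features,
 *						     upper->feature_mask);
 *
 * See br_features_recompute() and bond_compute_features() for real callers.
 */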
9324
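/* Allocate one hash table of NETDEV_HASHENTRIES list heads; netdev_init()
 * below uses two of these per namespace for the by-name (dev_name_head)
 * and by-ifindex (dev_index_head) device lookups.
 */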
9325static struct hlist_head * __net_init netdev_create_hash(void)
9326{
9327        int i;
9328        struct hlist_head *hash;
9329
9330        hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
9331        if (hash != NULL)
9332                for (i = 0; i < NETDEV_HASHENTRIES; i++)
9333                        INIT_HLIST_HEAD(&hash[i]);
9334
9335        return hash;
9336}
9337
9338/* Initialize per network namespace state */
9339static int __net_init netdev_init(struct net *net)
9340{
9341        BUILD_BUG_ON(GRO_HASH_BUCKETS >
9342                     8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
9343
9344        if (net != &init_net)
9345                INIT_LIST_HEAD(&net->dev_base_head);
9346
9347        net->dev_name_head = netdev_create_hash();
9348        if (net->dev_name_head == NULL)
9349                goto err_name;
9350
9351        net->dev_index_head = netdev_create_hash();
9352        if (net->dev_index_head == NULL)
9353                goto err_idx;
9354
9355        return 0;
9356
9357err_idx:
9358        kfree(net->dev_name_head);
9359err_name:
9360        return -ENOMEM;
9361}
9362
9363/**
9364 *      netdev_drivername - network driver for the device
9365 *      @dev: network device
9366 *
9367 *      Determine the network driver name for @dev; returns "" if none.
9368 */
9369const char *netdev_drivername(const struct net_device *dev)
9370{
9371        const struct device_driver *driver;
9372        const struct device *parent;
9373        const char *empty = "";
9374
9375        parent = dev->dev.parent;
9376        if (!parent)
9377                return empty;
9378
9379        driver = parent->driver;
9380        if (driver && driver->name)
9381                return driver->name;
9382        return empty;
9383}
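
/* For example, the transmit-timeout watchdog in net/sched/sch_generic.c
 * uses netdev_drivername() to name the offending driver in its
 * "NETDEV WATCHDOG" warning.
 */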
9384
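/* The KERN_<LEVEL> strings passed in here are an SOH byte followed by an
 * ASCII digit, so level[1] - '0' recovers the numeric loglevel that
 * dev_printk_emit() expects.
 */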
9385static void __netdev_printk(const char *level, const struct net_device *dev,
9386                            struct va_format *vaf)
9387{
9388        if (dev && dev->dev.parent) {
9389                dev_printk_emit(level[1] - '0',
9390                                dev->dev.parent,
9391                                "%s %s %s%s: %pV",
9392                                dev_driver_string(dev->dev.parent),
9393                                dev_name(dev->dev.parent),
9394                                netdev_name(dev), netdev_reg_state(dev),
9395                                vaf);
9396        } else if (dev) {
9397                printk("%s%s%s: %pV",
9398                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
9399        } else {
9400                printk("%s(NULL net_device): %pV", level, vaf);
9401        }
9402}
9403
9404void netdev_printk(const char *level, const struct net_device *dev,
9405                   const char *format, ...)
9406{
9407        struct va_format vaf;
9408        va_list args;
9409
9410        va_start(args, format);
9411
9412        vaf.fmt = format;
9413        vaf.va = &args;
9414
9415        __netdev_printk(level, dev, &vaf);
9416
9417        va_end(args);
9418}
9419EXPORT_SYMBOL(netdev_printk);
9420
9421#define define_netdev_printk_level(func, level)                 \
9422void func(const struct net_device *dev, const char *fmt, ...)   \
9423{                                                               \
9424        struct va_format vaf;                                   \
9425        va_list args;                                           \
9426                                                                \
9427        va_start(args, fmt);                                    \
9428                                                                \
9429        vaf.fmt = fmt;                                          \
9430        vaf.va = &args;                                         \
9431                                                                \
9432        __netdev_printk(level, dev, &vaf);                      \
9433                                                                \
9434        va_end(args);                                           \
9435}                                                               \
9436EXPORT_SYMBOL(func);
9437
9438define_netdev_printk_level(netdev_emerg, KERN_EMERG);
9439define_netdev_printk_level(netdev_alert, KERN_ALERT);
9440define_netdev_printk_level(netdev_crit, KERN_CRIT);
9441define_netdev_printk_level(netdev_err, KERN_ERR);
9442define_netdev_printk_level(netdev_warn, KERN_WARNING);
9443define_netdev_printk_level(netdev_notice, KERN_NOTICE);
9444define_netdev_printk_level(netdev_info, KERN_INFO);
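
/* Example (illustrative driver code) of the generated helpers:
 *
 *	netdev_info(dev, "link up, %u Mb/s\n", speed);
 *	netdev_err(dev, "firmware load failed: %d\n", err);
 *
 * Each message is prefixed with the driver name, the parent device name
 * and the interface name by __netdev_printk() above.
 */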
9445
9446static void __net_exit netdev_exit(struct net *net)
9447{
9448        kfree(net->dev_name_head);
9449        kfree(net->dev_index_head);
9450        if (net != &init_net)
9451                WARN_ON_ONCE(!list_empty(&net->dev_base_head));
9452}
9453
9454static struct pernet_operations __net_initdata netdev_net_ops = {
9455        .init = netdev_init,
9456        .exit = netdev_exit,
9457};
9458
9459static void __net_exit default_device_exit(struct net *net)
9460{
9461        struct net_device *dev, *aux;
9462        /*
9463         * Push all migratable network devices back to the
9464         * initial network namespace
9465         */
9466        rtnl_lock();
9467        for_each_netdev_safe(net, dev, aux) {
9468                int err;
9469                char fb_name[IFNAMSIZ];
9470
9471                /* Ignore unmovable devices (e.g. the loopback device) */
9472                if (dev->features & NETIF_F_NETNS_LOCAL)
9473                        continue;
9474
9475                /* Leave virtual devices for the generic cleanup */
9476                if (dev->rtnl_link_ops)
9477                        continue;
9478
9479                /* Push remaining network devices to init_net */
9480                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
9481                err = dev_change_net_namespace(dev, &init_net, fb_name);
9482                if (err) {
9483                        pr_emerg("%s: failed to move %s to init_net: %d\n",
9484                                 __func__, dev->name, err);
9485                        BUG();
9486                }
9487        }
9488        rtnl_unlock();
9489}
9490
9491static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
9492{
9493        /* Return with the rtnl_lock held when there are no network
9494         * devices unregistering in any network namespace in net_list.
9495         */
9496        struct net *net;
9497        bool unregistering;
9498        DEFINE_WAIT_FUNC(wait, woken_wake_function);
9499
9500        add_wait_queue(&netdev_unregistering_wq, &wait);
9501        for (;;) {
9502                unregistering = false;
9503                rtnl_lock();
9504                list_for_each_entry(net, net_list, exit_list) {
9505                        if (net->dev_unreg_count > 0) {
9506                                unregistering = true;
9507                                break;
9508                        }
9509                }
9510                if (!unregistering)
9511                        break;
9512                __rtnl_unlock();
9513
9514                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
9515        }
9516        remove_wait_queue(&netdev_unregistering_wq, &wait);
9517}
9518
9519static void __net_exit default_device_exit_batch(struct list_head *net_list)
9520{
9521        /* At exit all network devices must be removed from a network
9522         * namespace.  Do this in the reverse order of registration.
9523         * Do this across as many network namespaces as possible to
9524         * improve batching efficiency.
9525         */
9526        struct net_device *dev;
9527        struct net *net;
9528        LIST_HEAD(dev_kill_list);
9529
9530        /* To prevent network device cleanup code from dereferencing
9531         * loopback devices or network devices that have been freed,
9532         * wait here for all pending unregistrations to complete
9533         * before unregistering the loopback device and allowing the
9534         * network namespace to be freed.
9535         *
9536         * The netdev todo list containing all network device
9537         * unregistrations that happen in default_device_exit_batch
9538         * will run in the rtnl_unlock() at the end of
9539         * default_device_exit_batch.
9540         */
9541        rtnl_lock_unregistering(net_list);
9542        list_for_each_entry(net, net_list, exit_list) {
9543                for_each_netdev_reverse(net, dev) {
9544                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
9545                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
9546                        else
9547                                unregister_netdevice_queue(dev, &dev_kill_list);
9548                }
9549        }
9550        unregister_netdevice_many(&dev_kill_list);
9551        rtnl_unlock();
9552}
9553
9554static struct pernet_operations __net_initdata default_device_ops = {
9555        .exit = default_device_exit,
9556        .exit_batch = default_device_exit_batch,
9557};
9558
9559/*
9560 *      Initialize the DEV module. At boot time this walks the device list and
9561 *      unhooks any devices that fail to initialize (normally hardware not
9562 *      present) and leaves us with a valid list of present and active devices.
9563 */
9565
9566/*
9567 *       This is called single-threaded during boot, so there is no need
9568 *       to take the rtnl semaphore.
9569 */
9570static int __init net_dev_init(void)
9571{
9572        int i, rc = -ENOMEM;
9573
9574        BUG_ON(!dev_boot_phase);
9575
9576        if (dev_proc_init())
9577                goto out;
9578
9579        if (netdev_kobject_init())
9580                goto out;
9581
9582        INIT_LIST_HEAD(&ptype_all);
9583        for (i = 0; i < PTYPE_HASH_SIZE; i++)
9584                INIT_LIST_HEAD(&ptype_base[i]);
9585
9586        INIT_LIST_HEAD(&offload_base);
9587
9588        if (register_pernet_subsys(&netdev_net_ops))
9589                goto out;
9590
9591        /*
9592         *      Initialise the packet receive queues.
9593         */
9594
9595        for_each_possible_cpu(i) {
9596                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
9597                struct softnet_data *sd = &per_cpu(softnet_data, i);
9598
9599                INIT_WORK(flush, flush_backlog);
9600
9601                skb_queue_head_init(&sd->input_pkt_queue);
9602                skb_queue_head_init(&sd->process_queue);
9603#ifdef CONFIG_XFRM_OFFLOAD
9604                skb_queue_head_init(&sd->xfrm_backlog);
9605#endif
9606                INIT_LIST_HEAD(&sd->poll_list);
9607                sd->output_queue_tailp = &sd->output_queue;
9608#ifdef CONFIG_RPS
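                /* csd lets a remote CPU kick this CPU's backlog with an
                 * IPI; see rps_trigger_softirq() and net_rps_send_ipi().
                 */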
9609                sd->csd.func = rps_trigger_softirq;
9610                sd->csd.info = sd;
9611                sd->cpu = i;
9612#endif
9613
9614                init_gro_hash(&sd->backlog);
9615                sd->backlog.poll = process_backlog;
9616                sd->backlog.weight = weight_p;
9617        }
9618
9619        dev_boot_phase = 0;
9620
9621        /* The loopback device is special: if any other network device
9622         * is present in a network namespace, the loopback device must
9623         * be present too. Since we now dynamically allocate and free the
9624         * loopback device, ensure this invariant is maintained by
9625         * keeping the loopback device as the first device on the
9626         * list of network devices, so that it is the first device that
9627         * appears and the last network device that disappears.
9628         */
9630        if (register_pernet_device(&loopback_net_ops))
9631                goto out;
9632
9633        if (register_pernet_device(&default_device_ops))
9634                goto out;
9635
9636        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
9637        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
9638
9639        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
9640                                       NULL, dev_cpu_dead);
9641        WARN_ON(rc < 0);
9642        rc = 0;
9643out:
9644        return rc;
9645}
9646
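/* subsys_initcall runs before device/driver initcalls, so the per-CPU
 * softnet state and the NET_TX/NET_RX softirqs are ready before built-in
 * network drivers start registering devices.
 */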
9647subsys_initcall(net_dev_init);
9648