linux/net/core/dev.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      NET3    Protocol independent device support routines.
   4 *
   5 *      Derived from the non IP parts of dev.c 1.0.19
   6 *              Authors:        Ross Biro
   7 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   8 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
   9 *
  10 *      Additional Authors:
  11 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  12 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  13 *              David Hinds <dahinds@users.sourceforge.net>
  14 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  15 *              Adam Sulmicki <adam@cfar.umd.edu>
  16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  17 *
  18 *      Changes:
  19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  20 *                                      to 2 if register_netdev gets called
  21 *                                      before net_dev_init & also removed a
  22 *                                      few lines of code in the process.
  23 *              Alan Cox        :       device private ioctl copies fields back.
  24 *              Alan Cox        :       Transmit queue code does relevant
  25 *                                      stunts to keep the queue safe.
  26 *              Alan Cox        :       Fixed double lock.
  27 *              Alan Cox        :       Fixed promisc NULL pointer trap
  28 *              ????????        :       Support the full private ioctl range
  29 *              Alan Cox        :       Moved ioctl permission check into
  30 *                                      drivers
  31 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  32 *              Alan Cox        :       100 backlog just doesn't cut it when
  33 *                                      you start doing multicast video 8)
  34 *              Alan Cox        :       Rewrote net_bh and list manager.
  35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  36 *              Alan Cox        :       Took out transmit every packet pass
  37 *                                      Saved a few bytes in the ioctl handler
  38 *              Alan Cox        :       Network driver sets packet type before
  39 *                                      calling netif_rx. Saves a function
  40 *                                      call a packet.
  41 *              Alan Cox        :       Hashed net_bh()
  42 *              Richard Kooijman:       Timestamp fixes.
  43 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  44 *              Alan Cox        :       Device lock protection.
  45 *              Alan Cox        :       Fixed nasty side effect of device close
  46 *                                      changes.
  47 *              Rudi Cilibrasi  :       Pass the right thing to
  48 *                                      set_mac_address()
  49 *              Dave Miller     :       32bit quantity for the device lock to
  50 *                                      make it work out on a Sparc.
  51 *              Bjorn Ekwall    :       Added KERNELD hack.
  52 *              Alan Cox        :       Cleaned up the backlog initialise.
  53 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  54 *                                      1 device.
  55 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  56 *                                      is no device open function.
  57 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  58 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  59 *              Cyrus Durgin    :       Cleaned for KMOD
  60 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  61 *                                      A network device unload needs to purge
  62 *                                      the backlog queue.
  63 *      Paul Rusty Russell      :       SIOCSIFNAME
  64 *              Pekka Riikonen  :       Netdev boot-time settings code
  65 *              Andrew Morton   :       Make unregister_netdevice wait
  66 *                                      indefinitely on dev->refcnt
  67 *              J Hadi Salim    :       - Backlog queue sampling
  68 *                                      - netif_rx() feedback
  69 */
  70
  71#include <linux/uaccess.h>
  72#include <linux/bitops.h>
  73#include <linux/capability.h>
  74#include <linux/cpu.h>
  75#include <linux/types.h>
  76#include <linux/kernel.h>
  77#include <linux/hash.h>
  78#include <linux/slab.h>
  79#include <linux/sched.h>
  80#include <linux/sched/mm.h>
  81#include <linux/mutex.h>
  82#include <linux/rwsem.h>
  83#include <linux/string.h>
  84#include <linux/mm.h>
  85#include <linux/socket.h>
  86#include <linux/sockios.h>
  87#include <linux/errno.h>
  88#include <linux/interrupt.h>
  89#include <linux/if_ether.h>
  90#include <linux/netdevice.h>
  91#include <linux/etherdevice.h>
  92#include <linux/ethtool.h>
  93#include <linux/skbuff.h>
  94#include <linux/bpf.h>
  95#include <linux/bpf_trace.h>
  96#include <net/net_namespace.h>
  97#include <net/sock.h>
  98#include <net/busy_poll.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/dst_metadata.h>
 103#include <net/pkt_sched.h>
 104#include <net/pkt_cls.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/inetdevice.h>
 132#include <linux/cpu_rmap.h>
 133#include <linux/static_key.h>
 134#include <linux/hashtable.h>
 135#include <linux/vmalloc.h>
 136#include <linux/if_macvlan.h>
 137#include <linux/errqueue.h>
 138#include <linux/hrtimer.h>
 139#include <linux/netfilter_ingress.h>
 140#include <linux/crash_dump.h>
 141#include <linux/sctp.h>
 142#include <net/udp_tunnel.h>
 143#include <linux/net_namespace.h>
 144#include <linux/indirect_call_wrapper.h>
 145#include <net/devlink.h>
 146#include <linux/pm_runtime.h>
 147
 148#include "net-sysfs.h"
 149
 150#define MAX_GRO_SKBS 8
 151
 152/* This should be increased if a protocol with a bigger head is added. */
 153#define GRO_MAX_HEAD (MAX_HEADER + 128)
 154
 155static DEFINE_SPINLOCK(ptype_lock);
 156static DEFINE_SPINLOCK(offload_lock);
 157struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 158struct list_head ptype_all __read_mostly;       /* Taps */
 159static struct list_head offload_base __read_mostly;
 160
 161static int netif_rx_internal(struct sk_buff *skb);
 162static int call_netdevice_notifiers_info(unsigned long val,
 163                                         struct netdev_notifier_info *info);
 164static int call_netdevice_notifiers_extack(unsigned long val,
 165                                           struct net_device *dev,
 166                                           struct netlink_ext_ack *extack);
 167static struct napi_struct *napi_by_id(unsigned int napi_id);
 168
 169/*
 170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 171 * semaphore.
 172 *
 173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 174 *
 175 * Writers must hold the rtnl semaphore while they loop through the
 176 * dev_base_head list, and hold dev_base_lock for writing when they do the
 177 * actual updates.  This allows pure readers to access the list even
 178 * while a writer is preparing to update it.
 179 *
 180 * To put it another way, dev_base_lock is held for writing only to
 181 * protect against pure readers; the rtnl semaphore provides the
 182 * protection against other writers.
 183 *
 184 * See, for example usages, register_netdevice() and
 185 * unregister_netdevice(), which must be called with the rtnl
 186 * semaphore held.
 187 */
 188DEFINE_RWLOCK(dev_base_lock);
 189EXPORT_SYMBOL(dev_base_lock);
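/*
 * Illustrative sketch (not part of the original file): how the locking
 * rules above look in practice. A pure reader can walk the device list
 * under rcu_read_lock() alone; a writer must hold the rtnl semaphore and
 * take dev_base_lock for writing, as list_netdevice() further down does.
 * The helper name example_count_up_devices() is hypothetical.
 */
static int example_count_up_devices(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	rcu_read_lock();		/* pure reader: RCU alone is enough */
	for_each_netdev_rcu(net, dev)
		if (dev->flags & IFF_UP)
			count++;
	rcu_read_unlock();

	return count;
}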
 190
 191static DEFINE_MUTEX(ifalias_mutex);
 192
 193/* protects napi_hash addition/deletion and napi_gen_id */
 194static DEFINE_SPINLOCK(napi_hash_lock);
 195
 196static unsigned int napi_gen_id = NR_CPUS;
 197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 198
 199static DECLARE_RWSEM(devnet_rename_sem);
 200
 201static inline void dev_base_seq_inc(struct net *net)
 202{
 203        while (++net->dev_base_seq == 0)
 204                ;
 205}
 206
 207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208{
 209        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 210
 211        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 212}
 213
 214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 215{
 216        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 217}
 218
 219static inline void rps_lock(struct softnet_data *sd)
 220{
 221#ifdef CONFIG_RPS
 222        spin_lock(&sd->input_pkt_queue.lock);
 223#endif
 224}
 225
 226static inline void rps_unlock(struct softnet_data *sd)
 227{
 228#ifdef CONFIG_RPS
 229        spin_unlock(&sd->input_pkt_queue.lock);
 230#endif
 231}
 232
 233static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 234                                                       const char *name)
 235{
 236        struct netdev_name_node *name_node;
 237
 238        name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
 239        if (!name_node)
 240                return NULL;
 241        INIT_HLIST_NODE(&name_node->hlist);
 242        name_node->dev = dev;
 243        name_node->name = name;
 244        return name_node;
 245}
 246
 247static struct netdev_name_node *
 248netdev_name_node_head_alloc(struct net_device *dev)
 249{
 250        struct netdev_name_node *name_node;
 251
 252        name_node = netdev_name_node_alloc(dev, dev->name);
 253        if (!name_node)
 254                return NULL;
 255        INIT_LIST_HEAD(&name_node->list);
 256        return name_node;
 257}
 258
 259static void netdev_name_node_free(struct netdev_name_node *name_node)
 260{
 261        kfree(name_node);
 262}
 263
 264static void netdev_name_node_add(struct net *net,
 265                                 struct netdev_name_node *name_node)
 266{
 267        hlist_add_head_rcu(&name_node->hlist,
 268                           dev_name_hash(net, name_node->name));
 269}
 270
 271static void netdev_name_node_del(struct netdev_name_node *name_node)
 272{
 273        hlist_del_rcu(&name_node->hlist);
 274}
 275
 276static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
 277                                                        const char *name)
 278{
 279        struct hlist_head *head = dev_name_hash(net, name);
 280        struct netdev_name_node *name_node;
 281
 282        hlist_for_each_entry(name_node, head, hlist)
 283                if (!strcmp(name_node->name, name))
 284                        return name_node;
 285        return NULL;
 286}
 287
 288static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
 289                                                            const char *name)
 290{
 291        struct hlist_head *head = dev_name_hash(net, name);
 292        struct netdev_name_node *name_node;
 293
 294        hlist_for_each_entry_rcu(name_node, head, hlist)
 295                if (!strcmp(name_node->name, name))
 296                        return name_node;
 297        return NULL;
 298}
 299
 300int netdev_name_node_alt_create(struct net_device *dev, const char *name)
 301{
 302        struct netdev_name_node *name_node;
 303        struct net *net = dev_net(dev);
 304
 305        name_node = netdev_name_node_lookup(net, name);
 306        if (name_node)
 307                return -EEXIST;
 308        name_node = netdev_name_node_alloc(dev, name);
 309        if (!name_node)
 310                return -ENOMEM;
 311        netdev_name_node_add(net, name_node);
  312        /* The node that holds dev->name acts as the head of the per-device list. */
 313        list_add_tail(&name_node->list, &dev->name_node->list);
 314
 315        return 0;
 316}
 317EXPORT_SYMBOL(netdev_name_node_alt_create);
 318
 319static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 320{
 321        list_del(&name_node->list);
 322        netdev_name_node_del(name_node);
 323        kfree(name_node->name);
 324        netdev_name_node_free(name_node);
 325}
 326
 327int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
 328{
 329        struct netdev_name_node *name_node;
 330        struct net *net = dev_net(dev);
 331
 332        name_node = netdev_name_node_lookup(net, name);
 333        if (!name_node)
 334                return -ENOENT;
 335        /* lookup might have found our primary name or a name belonging
 336         * to another device.
 337         */
 338        if (name_node == dev->name_node || name_node->dev != dev)
 339                return -EINVAL;
 340
 341        __netdev_name_node_alt_destroy(name_node);
 342
 343        return 0;
 344}
 345EXPORT_SYMBOL(netdev_name_node_alt_destroy);
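/*
 * Illustrative sketch (not part of the original file), assuming "eth0"
 * has been given the alternative name "wan0", e.g. via an RTM_NEWLINKPROP
 * netlink request that ends up in netdev_name_node_alt_create(). Both
 * names then hash to name nodes pointing at the same net_device, so
 * either one resolves the device. RTNL must be held for
 * __dev_get_by_name(); the helper name is hypothetical.
 */
static bool example_alt_name_resolves(struct net *net)
{
	struct net_device *by_primary, *by_alt;

	ASSERT_RTNL();
	by_primary = __dev_get_by_name(net, "eth0");
	by_alt = __dev_get_by_name(net, "wan0");

	return by_primary && by_primary == by_alt;
}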
 346
 347static void netdev_name_node_alt_flush(struct net_device *dev)
 348{
 349        struct netdev_name_node *name_node, *tmp;
 350
 351        list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
 352                __netdev_name_node_alt_destroy(name_node);
 353}
 354
 355/* Device list insertion */
 356static void list_netdevice(struct net_device *dev)
 357{
 358        struct net *net = dev_net(dev);
 359
 360        ASSERT_RTNL();
 361
 362        write_lock_bh(&dev_base_lock);
 363        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 364        netdev_name_node_add(net, dev->name_node);
 365        hlist_add_head_rcu(&dev->index_hlist,
 366                           dev_index_hash(net, dev->ifindex));
 367        write_unlock_bh(&dev_base_lock);
 368
 369        dev_base_seq_inc(net);
 370}
 371
 372/* Device list removal
  373 * caller must respect an RCU grace period before freeing/reusing dev
 374 */
 375static void unlist_netdevice(struct net_device *dev)
 376{
 377        ASSERT_RTNL();
 378
 379        /* Unlink dev from the device chain */
 380        write_lock_bh(&dev_base_lock);
 381        list_del_rcu(&dev->dev_list);
 382        netdev_name_node_del(dev->name_node);
 383        hlist_del_rcu(&dev->index_hlist);
 384        write_unlock_bh(&dev_base_lock);
 385
 386        dev_base_seq_inc(dev_net(dev));
 387}
 388
 389/*
 390 *      Our notifier list
 391 */
 392
 393static RAW_NOTIFIER_HEAD(netdev_chain);
 394
 395/*
 396 *      Device drivers call our routines to queue packets here. We empty the
 397 *      queue in the local softnet handler.
 398 */
 399
 400DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 401EXPORT_PER_CPU_SYMBOL(softnet_data);
 402
 403#ifdef CONFIG_LOCKDEP
 404/*
 405 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 406 * according to dev->type
 407 */
 408static const unsigned short netdev_lock_type[] = {
 409         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 410         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 411         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 412         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 413         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 414         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 415         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 416         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 417         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 418         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 419         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 420         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 421         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 422         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 423         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 424
 425static const char *const netdev_lock_name[] = {
 426        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 427        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 428        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 429        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 430        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 431        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 432        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 433        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 434        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 435        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 436        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 437        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 438        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 439        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 440        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 441
 442static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 443static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 444
 445static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 446{
 447        int i;
 448
 449        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 450                if (netdev_lock_type[i] == dev_type)
 451                        return i;
 452        /* the last key is used by default */
 453        return ARRAY_SIZE(netdev_lock_type) - 1;
 454}
 455
 456static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 457                                                 unsigned short dev_type)
 458{
 459        int i;
 460
 461        i = netdev_lock_pos(dev_type);
 462        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 463                                   netdev_lock_name[i]);
 464}
 465
 466static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 467{
 468        int i;
 469
 470        i = netdev_lock_pos(dev->type);
 471        lockdep_set_class_and_name(&dev->addr_list_lock,
 472                                   &netdev_addr_lock_key[i],
 473                                   netdev_lock_name[i]);
 474}
 475#else
 476static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 477                                                 unsigned short dev_type)
 478{
 479}
 480
 481static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 482{
 483}
 484#endif
 485
 486/*******************************************************************************
 487 *
 488 *              Protocol management and registration routines
 489 *
 490 *******************************************************************************/
 491
 492
 493/*
 494 *      Add a protocol ID to the list. Now that the input handler is
 495 *      smarter we can dispense with all the messy stuff that used to be
 496 *      here.
 497 *
  498 *      BEWARE!!! Protocol handlers that mangle input packets
  499 *      MUST BE last in the hash buckets, and the walk over protocol
  500 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
  501 *      This is true today; do not change it.
  502 *      Explanation: if a mangling protocol handler were first on the
  503 *      list, it could not tell that the packet is cloned and should be
  504 *      copied-on-write, so it would modify it in place and subsequent
  505 *      readers would get a broken packet.
  506 *                                                      --ANK (980803)
 507 */
 508
 509static inline struct list_head *ptype_head(const struct packet_type *pt)
 510{
 511        if (pt->type == htons(ETH_P_ALL))
 512                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 513        else
 514                return pt->dev ? &pt->dev->ptype_specific :
 515                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 516}
 517
 518/**
 519 *      dev_add_pack - add packet handler
 520 *      @pt: packet type declaration
 521 *
 522 *      Add a protocol handler to the networking stack. The passed &packet_type
 523 *      is linked into kernel lists and may not be freed until it has been
 524 *      removed from the kernel lists.
 525 *
  526 *      This call does not sleep, therefore it cannot
  527 *      guarantee that all CPUs that are in the middle of receiving packets
  528 *      will see the new packet type (until the next received packet).
 529 */
 530
 531void dev_add_pack(struct packet_type *pt)
 532{
 533        struct list_head *head = ptype_head(pt);
 534
 535        spin_lock(&ptype_lock);
 536        list_add_rcu(&pt->list, head);
 537        spin_unlock(&ptype_lock);
 538}
 539EXPORT_SYMBOL(dev_add_pack);
 540
 541/**
 542 *      __dev_remove_pack        - remove packet handler
 543 *      @pt: packet type declaration
 544 *
 545 *      Remove a protocol handler that was previously added to the kernel
 546 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 547 *      from the kernel lists and can be freed or reused once this function
 548 *      returns.
 549 *
 550 *      The packet type might still be in use by receivers
  551 *      and must not be freed until after all the CPUs have gone
 552 *      through a quiescent state.
 553 */
 554void __dev_remove_pack(struct packet_type *pt)
 555{
 556        struct list_head *head = ptype_head(pt);
 557        struct packet_type *pt1;
 558
 559        spin_lock(&ptype_lock);
 560
 561        list_for_each_entry(pt1, head, list) {
 562                if (pt == pt1) {
 563                        list_del_rcu(&pt->list);
 564                        goto out;
 565                }
 566        }
 567
 568        pr_warn("dev_remove_pack: %p not found\n", pt);
 569out:
 570        spin_unlock(&ptype_lock);
 571}
 572EXPORT_SYMBOL(__dev_remove_pack);
 573
 574/**
 575 *      dev_remove_pack  - remove packet handler
 576 *      @pt: packet type declaration
 577 *
 578 *      Remove a protocol handler that was previously added to the kernel
 579 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 580 *      from the kernel lists and can be freed or reused once this function
 581 *      returns.
 582 *
 583 *      This call sleeps to guarantee that no CPU is looking at the packet
 584 *      type after return.
 585 */
 586void dev_remove_pack(struct packet_type *pt)
 587{
 588        __dev_remove_pack(pt);
 589
 590        synchronize_net();
 591}
 592EXPORT_SYMBOL(dev_remove_pack);
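/*
 * Illustrative sketch (not part of the original file): registering a tap
 * that sees every received packet, then unregistering it. The handler and
 * variable names are hypothetical; struct packet_type, dev_add_pack() and
 * dev_remove_pack() are used as documented above.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* This tap does nothing with the data; drop our reference to the skb. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* ETH_P_ALL: see every protocol */
	.func = example_tap_rcv,
};

/*
 * Registration:   dev_add_pack(&example_tap);
 * Unregistration: dev_remove_pack(&example_tap);  (sleeps, see above)
 */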
 593
 594
 595/**
 596 *      dev_add_offload - register offload handlers
 597 *      @po: protocol offload declaration
 598 *
 599 *      Add protocol offload handlers to the networking stack. The passed
 600 *      &proto_offload is linked into kernel lists and may not be freed until
 601 *      it has been removed from the kernel lists.
 602 *
  603 *      This call does not sleep, therefore it cannot
  604 *      guarantee that all CPUs that are in the middle of receiving packets
  605 *      will see the new offload handlers (until the next received packet).
 606 */
 607void dev_add_offload(struct packet_offload *po)
 608{
 609        struct packet_offload *elem;
 610
 611        spin_lock(&offload_lock);
 612        list_for_each_entry(elem, &offload_base, list) {
 613                if (po->priority < elem->priority)
 614                        break;
 615        }
 616        list_add_rcu(&po->list, elem->list.prev);
 617        spin_unlock(&offload_lock);
 618}
 619EXPORT_SYMBOL(dev_add_offload);
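/*
 * Illustrative sketch (not part of the original file): offload handlers
 * are kept sorted by ->priority, so an offload that must be considered
 * before others registers a lower value. The callbacks are omitted here
 * purely for brevity; a real packet_offload fills in .callbacks with its
 * GSO/GRO hooks before calling dev_add_offload().
 */
static struct packet_offload example_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.priority = 1,		/* placed ahead of entries with larger values */
};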
 620
 621/**
 622 *      __dev_remove_offload     - remove offload handler
 623 *      @po: packet offload declaration
 624 *
 625 *      Remove a protocol offload handler that was previously added to the
 626 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 627 *      is removed from the kernel lists and can be freed or reused once this
 628 *      function returns.
 629 *
 630 *      The packet type might still be in use by receivers
  631 *      and must not be freed until after all the CPUs have gone
 632 *      through a quiescent state.
 633 */
 634static void __dev_remove_offload(struct packet_offload *po)
 635{
 636        struct list_head *head = &offload_base;
 637        struct packet_offload *po1;
 638
 639        spin_lock(&offload_lock);
 640
 641        list_for_each_entry(po1, head, list) {
 642                if (po == po1) {
 643                        list_del_rcu(&po->list);
 644                        goto out;
 645                }
 646        }
 647
 648        pr_warn("dev_remove_offload: %p not found\n", po);
 649out:
 650        spin_unlock(&offload_lock);
 651}
 652
 653/**
 654 *      dev_remove_offload       - remove packet offload handler
 655 *      @po: packet offload declaration
 656 *
 657 *      Remove a packet offload handler that was previously added to the kernel
 658 *      offload handlers by dev_add_offload(). The passed &offload_type is
 659 *      removed from the kernel lists and can be freed or reused once this
 660 *      function returns.
 661 *
 662 *      This call sleeps to guarantee that no CPU is looking at the packet
 663 *      type after return.
 664 */
 665void dev_remove_offload(struct packet_offload *po)
 666{
 667        __dev_remove_offload(po);
 668
 669        synchronize_net();
 670}
 671EXPORT_SYMBOL(dev_remove_offload);
 672
 673/******************************************************************************
 674 *
 675 *                    Device Boot-time Settings Routines
 676 *
 677 ******************************************************************************/
 678
 679/* Boot time configuration table */
 680static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 681
 682/**
 683 *      netdev_boot_setup_add   - add new setup entry
 684 *      @name: name of the device
 685 *      @map: configured settings for the device
 686 *
  687 *      Adds a new setup entry to the dev_boot_setup list.  The function
  688 *      returns 0 on error and 1 on success.  This is a generic routine
  689 *      for all netdevices.
 690 */
 691static int netdev_boot_setup_add(char *name, struct ifmap *map)
 692{
 693        struct netdev_boot_setup *s;
 694        int i;
 695
 696        s = dev_boot_setup;
 697        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 698                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 699                        memset(s[i].name, 0, sizeof(s[i].name));
 700                        strlcpy(s[i].name, name, IFNAMSIZ);
 701                        memcpy(&s[i].map, map, sizeof(s[i].map));
 702                        break;
 703                }
 704        }
 705
 706        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 707}
 708
 709/**
 710 * netdev_boot_setup_check      - check boot time settings
 711 * @dev: the netdevice
 712 *
 713 * Check boot time settings for the device.
  714 * Any settings found are stored in the device so that they can be used
  715 * later during device probing.
  716 * Returns 1 if settings were found, 0 otherwise.
 717 */
 718int netdev_boot_setup_check(struct net_device *dev)
 719{
 720        struct netdev_boot_setup *s = dev_boot_setup;
 721        int i;
 722
 723        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 724                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 725                    !strcmp(dev->name, s[i].name)) {
 726                        dev->irq = s[i].map.irq;
 727                        dev->base_addr = s[i].map.base_addr;
 728                        dev->mem_start = s[i].map.mem_start;
 729                        dev->mem_end = s[i].map.mem_end;
 730                        return 1;
 731                }
 732        }
 733        return 0;
 734}
 735EXPORT_SYMBOL(netdev_boot_setup_check);
 736
 737
 738/**
 739 * netdev_boot_base     - get address from boot time settings
 740 * @prefix: prefix for network device
 741 * @unit: id for network device
 742 *
  743 * Check boot time settings for the base address of the device.
  744 * The found base address is returned so that it can be used
  745 * later during device probing.
  746 * Returns 0 if no settings are found.
 747 */
 748unsigned long netdev_boot_base(const char *prefix, int unit)
 749{
 750        const struct netdev_boot_setup *s = dev_boot_setup;
 751        char name[IFNAMSIZ];
 752        int i;
 753
 754        sprintf(name, "%s%d", prefix, unit);
 755
 756        /*
  757         * If the device is already registered, return a base of 1
  758         * to indicate that this interface should not be probed
 759         */
 760        if (__dev_get_by_name(&init_net, name))
 761                return 1;
 762
 763        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 764                if (!strcmp(name, s[i].name))
 765                        return s[i].map.base_addr;
 766        return 0;
 767}
 768
 769/*
  770 * Saves the settings configured at boot time for any netdevice.
 771 */
 772int __init netdev_boot_setup(char *str)
 773{
 774        int ints[5];
 775        struct ifmap map;
 776
 777        str = get_options(str, ARRAY_SIZE(ints), ints);
 778        if (!str || !*str)
 779                return 0;
 780
 781        /* Save settings */
 782        memset(&map, 0, sizeof(map));
 783        if (ints[0] > 0)
 784                map.irq = ints[1];
 785        if (ints[0] > 1)
 786                map.base_addr = ints[2];
 787        if (ints[0] > 2)
 788                map.mem_start = ints[3];
 789        if (ints[0] > 3)
 790                map.mem_end = ints[4];
 791
 792        /* Add new entry to the list */
 793        return netdev_boot_setup_add(str, &map);
 794}
 795
 796__setup("netdev=", netdev_boot_setup);
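/*
 * Example of the boot parameter handled above (values are illustrative):
 *
 *	netdev=9,0x300,eth0
 *
 * get_options() parses the leading integers into ints[] = { 2, 9, 0x300 }
 * and leaves str pointing at "eth0", so netdev_boot_setup() records
 * irq = 9 and base_addr = 0x300 under the name "eth0". A later call to
 * netdev_boot_setup_check() on the device named "eth0" copies those
 * values into dev->irq and dev->base_addr.
 */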
 797
 798/*******************************************************************************
 799 *
 800 *                          Device Interface Subroutines
 801 *
 802 *******************************************************************************/
 803
 804/**
  805 *      dev_get_iflink  - get 'iflink' value of an interface
 806 *      @dev: targeted interface
 807 *
 808 *      Indicates the ifindex the interface is linked to.
 809 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 810 */
 811
 812int dev_get_iflink(const struct net_device *dev)
 813{
 814        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 815                return dev->netdev_ops->ndo_get_iflink(dev);
 816
 817        return dev->ifindex;
 818}
 819EXPORT_SYMBOL(dev_get_iflink);
 820
 821/**
 822 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 823 *      @dev: targeted interface
 824 *      @skb: The packet.
 825 *
  826 *      For better visibility of tunnel traffic, OVS needs to retrieve
  827 *      egress tunnel information for a packet. The following API allows
  828 *      the user to get this info.
 829 */
 830int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 831{
 832        struct ip_tunnel_info *info;
 833
 834        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 835                return -EINVAL;
 836
 837        info = skb_tunnel_info_unclone(skb);
 838        if (!info)
 839                return -ENOMEM;
 840        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 841                return -EINVAL;
 842
 843        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 844}
 845EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 846
 847/**
 848 *      __dev_get_by_name       - find a device by its name
 849 *      @net: the applicable net namespace
 850 *      @name: name to find
 851 *
 852 *      Find an interface by name. Must be called under RTNL semaphore
 853 *      or @dev_base_lock. If the name is found a pointer to the device
 854 *      is returned. If the name is not found then %NULL is returned. The
 855 *      reference counters are not incremented so the caller must be
 856 *      careful with locks.
 857 */
 858
 859struct net_device *__dev_get_by_name(struct net *net, const char *name)
 860{
 861        struct netdev_name_node *node_name;
 862
 863        node_name = netdev_name_node_lookup(net, name);
 864        return node_name ? node_name->dev : NULL;
 865}
 866EXPORT_SYMBOL(__dev_get_by_name);
 867
 868/**
 869 * dev_get_by_name_rcu  - find a device by its name
 870 * @net: the applicable net namespace
 871 * @name: name to find
 872 *
 873 * Find an interface by name.
 874 * If the name is found a pointer to the device is returned.
 875 * If the name is not found then %NULL is returned.
 876 * The reference counters are not incremented so the caller must be
 877 * careful with locks. The caller must hold RCU lock.
 878 */
 879
 880struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 881{
 882        struct netdev_name_node *node_name;
 883
 884        node_name = netdev_name_node_lookup_rcu(net, name);
 885        return node_name ? node_name->dev : NULL;
 886}
 887EXPORT_SYMBOL(dev_get_by_name_rcu);
 888
 889/**
 890 *      dev_get_by_name         - find a device by its name
 891 *      @net: the applicable net namespace
 892 *      @name: name to find
 893 *
 894 *      Find an interface by name. This can be called from any
 895 *      context and does its own locking. The returned handle has
 896 *      the usage count incremented and the caller must use dev_put() to
 897 *      release it when it is no longer needed. %NULL is returned if no
 898 *      matching device is found.
 899 */
 900
 901struct net_device *dev_get_by_name(struct net *net, const char *name)
 902{
 903        struct net_device *dev;
 904
 905        rcu_read_lock();
 906        dev = dev_get_by_name_rcu(net, name);
 907        if (dev)
 908                dev_hold(dev);
 909        rcu_read_unlock();
 910        return dev;
 911}
 912EXPORT_SYMBOL(dev_get_by_name);
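/*
 * Illustrative sketch (not part of the original file): the two lookup
 * flavours above differ only in reference handling. dev_get_by_name()
 * returns the device with its refcount held, so the caller must balance
 * it with dev_put(); dev_get_by_name_rcu() takes no reference and the
 * result is only valid inside the caller's rcu_read_lock() section. The
 * device name and helper name are assumptions for the example.
 */
static void example_lookup_flavours(struct net *net)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, "eth0");	/* reference held on success */
	if (dev) {
		/* ... usable from any context ... */
		dev_put(dev);
	}

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");	/* no reference taken */
	if (dev) {
		/* ... only valid until rcu_read_unlock() ... */
	}
	rcu_read_unlock();
}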
 913
 914/**
 915 *      __dev_get_by_index - find a device by its ifindex
 916 *      @net: the applicable net namespace
 917 *      @ifindex: index of device
 918 *
 919 *      Search for an interface by index. Returns %NULL if the device
 920 *      is not found or a pointer to the device. The device has not
 921 *      had its reference counter increased so the caller must be careful
 922 *      about locking. The caller must hold either the RTNL semaphore
 923 *      or @dev_base_lock.
 924 */
 925
 926struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 927{
 928        struct net_device *dev;
 929        struct hlist_head *head = dev_index_hash(net, ifindex);
 930
 931        hlist_for_each_entry(dev, head, index_hlist)
 932                if (dev->ifindex == ifindex)
 933                        return dev;
 934
 935        return NULL;
 936}
 937EXPORT_SYMBOL(__dev_get_by_index);
 938
 939/**
 940 *      dev_get_by_index_rcu - find a device by its ifindex
 941 *      @net: the applicable net namespace
 942 *      @ifindex: index of device
 943 *
 944 *      Search for an interface by index. Returns %NULL if the device
 945 *      is not found or a pointer to the device. The device has not
 946 *      had its reference counter increased so the caller must be careful
 947 *      about locking. The caller must hold RCU lock.
 948 */
 949
 950struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 951{
 952        struct net_device *dev;
 953        struct hlist_head *head = dev_index_hash(net, ifindex);
 954
 955        hlist_for_each_entry_rcu(dev, head, index_hlist)
 956                if (dev->ifindex == ifindex)
 957                        return dev;
 958
 959        return NULL;
 960}
 961EXPORT_SYMBOL(dev_get_by_index_rcu);
 962
 963
 964/**
 965 *      dev_get_by_index - find a device by its ifindex
 966 *      @net: the applicable net namespace
 967 *      @ifindex: index of device
 968 *
 969 *      Search for an interface by index. Returns NULL if the device
 970 *      is not found or a pointer to the device. The device returned has
 971 *      had a reference added and the pointer is safe until the user calls
 972 *      dev_put to indicate they have finished with it.
 973 */
 974
 975struct net_device *dev_get_by_index(struct net *net, int ifindex)
 976{
 977        struct net_device *dev;
 978
 979        rcu_read_lock();
 980        dev = dev_get_by_index_rcu(net, ifindex);
 981        if (dev)
 982                dev_hold(dev);
 983        rcu_read_unlock();
 984        return dev;
 985}
 986EXPORT_SYMBOL(dev_get_by_index);
 987
 988/**
 989 *      dev_get_by_napi_id - find a device by napi_id
 990 *      @napi_id: ID of the NAPI struct
 991 *
 992 *      Search for an interface by NAPI ID. Returns %NULL if the device
 993 *      is not found or a pointer to the device. The device has not had
 994 *      its reference counter increased so the caller must be careful
 995 *      about locking. The caller must hold RCU lock.
 996 */
 997
 998struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 999{
1000        struct napi_struct *napi;
1001
1002        WARN_ON_ONCE(!rcu_read_lock_held());
1003
1004        if (napi_id < MIN_NAPI_ID)
1005                return NULL;
1006
1007        napi = napi_by_id(napi_id);
1008
1009        return napi ? napi->dev : NULL;
1010}
1011EXPORT_SYMBOL(dev_get_by_napi_id);
1012
1013/**
1014 *      netdev_get_name - get a netdevice name, knowing its ifindex.
1015 *      @net: network namespace
1016 *      @name: a pointer to the buffer where the name will be stored.
1017 *      @ifindex: the ifindex of the interface to get the name from.
1018 */
1019int netdev_get_name(struct net *net, char *name, int ifindex)
1020{
1021        struct net_device *dev;
1022        int ret;
1023
1024        down_read(&devnet_rename_sem);
1025        rcu_read_lock();
1026
1027        dev = dev_get_by_index_rcu(net, ifindex);
1028        if (!dev) {
1029                ret = -ENODEV;
1030                goto out;
1031        }
1032
1033        strcpy(name, dev->name);
1034
1035        ret = 0;
1036out:
1037        rcu_read_unlock();
1038        up_read(&devnet_rename_sem);
1039        return ret;
1040}
1041
1042/**
1043 *      dev_getbyhwaddr_rcu - find a device by its hardware address
1044 *      @net: the applicable net namespace
1045 *      @type: media type of device
1046 *      @ha: hardware address
1047 *
1048 *      Search for an interface by MAC address. Returns NULL if the device
1049 *      is not found or a pointer to the device.
1050 *      The caller must hold RCU or RTNL.
1051 *      The returned device has not had its ref count increased
1052 *      and the caller must therefore be careful about locking
1053 *
1054 */
1055
1056struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1057                                       const char *ha)
1058{
1059        struct net_device *dev;
1060
1061        for_each_netdev_rcu(net, dev)
1062                if (dev->type == type &&
1063                    !memcmp(dev->dev_addr, ha, dev->addr_len))
1064                        return dev;
1065
1066        return NULL;
1067}
1068EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
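/*
 * Illustrative sketch (not part of the original file): checking whether a
 * given Ethernet MAC address belongs to a local device. The helper name
 * is hypothetical; the lookup must run inside an RCU read-side section,
 * and the result is not used after rcu_read_unlock() since no reference
 * is taken.
 */
static bool example_mac_is_local(struct net *net, const char *mac)
{
	struct net_device *dev;
	bool local;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	local = dev != NULL;
	rcu_read_unlock();

	return local;
}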
1069
1070struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1071{
1072        struct net_device *dev;
1073
1074        ASSERT_RTNL();
1075        for_each_netdev(net, dev)
1076                if (dev->type == type)
1077                        return dev;
1078
1079        return NULL;
1080}
1081EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1082
1083struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1084{
1085        struct net_device *dev, *ret = NULL;
1086
1087        rcu_read_lock();
1088        for_each_netdev_rcu(net, dev)
1089                if (dev->type == type) {
1090                        dev_hold(dev);
1091                        ret = dev;
1092                        break;
1093                }
1094        rcu_read_unlock();
1095        return ret;
1096}
1097EXPORT_SYMBOL(dev_getfirstbyhwtype);
1098
1099/**
1100 *      __dev_get_by_flags - find any device with given flags
1101 *      @net: the applicable net namespace
1102 *      @if_flags: IFF_* values
1103 *      @mask: bitmask of bits in if_flags to check
1104 *
1105 *      Search for any interface with the given flags. Returns NULL if a device
1106 *      is not found or a pointer to the device. Must be called inside
1107 *      rtnl_lock(), and result refcount is unchanged.
1108 */
1109
1110struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1111                                      unsigned short mask)
1112{
1113        struct net_device *dev, *ret;
1114
1115        ASSERT_RTNL();
1116
1117        ret = NULL;
1118        for_each_netdev(net, dev) {
1119                if (((dev->flags ^ if_flags) & mask) == 0) {
1120                        ret = dev;
1121                        break;
1122                }
1123        }
1124        return ret;
1125}
1126EXPORT_SYMBOL(__dev_get_by_flags);
1127
1128/**
1129 *      dev_valid_name - check if name is okay for network device
1130 *      @name: name string
1131 *
 1132 *      Network device names need to be valid file names
1133 *      to allow sysfs to work.  We also disallow any kind of
1134 *      whitespace.
1135 */
1136bool dev_valid_name(const char *name)
1137{
1138        if (*name == '\0')
1139                return false;
1140        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1141                return false;
1142        if (!strcmp(name, ".") || !strcmp(name, ".."))
1143                return false;
1144
1145        while (*name) {
1146                if (*name == '/' || *name == ':' || isspace(*name))
1147                        return false;
1148                name++;
1149        }
1150        return true;
1151}
1152EXPORT_SYMBOL(dev_valid_name);
1153
1154/**
1155 *      __dev_alloc_name - allocate a name for a device
1156 *      @net: network namespace to allocate the device name in
1157 *      @name: name format string
1158 *      @buf:  scratch buffer and result name string
1159 *
 1160 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1161 *      id. It scans the list of devices to build a map of ids in use, then
 1162 *      chooses the first free slot. The caller must hold the dev_base or rtnl lock
1163 *      while allocating the name and adding the device in order to avoid
1164 *      duplicates.
1165 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1166 *      Returns the number of the unit assigned or a negative errno code.
1167 */
1168
1169static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1170{
1171        int i = 0;
1172        const char *p;
1173        const int max_netdevices = 8*PAGE_SIZE;
1174        unsigned long *inuse;
1175        struct net_device *d;
1176
1177        if (!dev_valid_name(name))
1178                return -EINVAL;
1179
1180        p = strchr(name, '%');
1181        if (p) {
1182                /*
1183                 * Verify the string as this thing may have come from
1184                 * the user.  There must be either one "%d" and no other "%"
1185                 * characters.
1186                 */
1187                if (p[1] != 'd' || strchr(p + 2, '%'))
1188                        return -EINVAL;
1189
1190                /* Use one page as a bit array of possible slots */
1191                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1192                if (!inuse)
1193                        return -ENOMEM;
1194
1195                for_each_netdev(net, d) {
1196                        if (!sscanf(d->name, name, &i))
1197                                continue;
1198                        if (i < 0 || i >= max_netdevices)
1199                                continue;
1200
1201                        /*  avoid cases where sscanf is not exact inverse of printf */
1202                        snprintf(buf, IFNAMSIZ, name, i);
1203                        if (!strncmp(buf, d->name, IFNAMSIZ))
1204                                set_bit(i, inuse);
1205                }
1206
1207                i = find_first_zero_bit(inuse, max_netdevices);
1208                free_page((unsigned long) inuse);
1209        }
1210
1211        snprintf(buf, IFNAMSIZ, name, i);
1212        if (!__dev_get_by_name(net, buf))
1213                return i;
1214
1215        /* It is possible to run out of possible slots
1216         * when the name is long and there isn't enough space left
1217         * for the digits, or if all bits are used.
1218         */
1219        return -ENFILE;
1220}
1221
1222static int dev_alloc_name_ns(struct net *net,
1223                             struct net_device *dev,
1224                             const char *name)
1225{
1226        char buf[IFNAMSIZ];
1227        int ret;
1228
1229        BUG_ON(!net);
1230        ret = __dev_alloc_name(net, name, buf);
1231        if (ret >= 0)
1232                strlcpy(dev->name, buf, IFNAMSIZ);
1233        return ret;
1234}
1235
1236/**
1237 *      dev_alloc_name - allocate a name for a device
1238 *      @dev: device
1239 *      @name: name format string
1240 *
 1241 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 1242 *      id. It scans the list of devices to build a map of ids in use, then
 1243 *      chooses the first free slot. The caller must hold the dev_base or rtnl lock
1244 *      while allocating the name and adding the device in order to avoid
1245 *      duplicates.
1246 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1247 *      Returns the number of the unit assigned or a negative errno code.
1248 */
1249
1250int dev_alloc_name(struct net_device *dev, const char *name)
1251{
1252        return dev_alloc_name_ns(dev_net(dev), dev, name);
1253}
1254EXPORT_SYMBOL(dev_alloc_name);
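/*
 * Illustrative sketch (not part of the original file): a driver typically
 * passes a format such as "eth%d" and lets dev_alloc_name() pick the first
 * free unit. The helper name is hypothetical; the rtnl lock requirement
 * comes from the comment above.
 */
static int example_pick_name(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();
	unit = dev_alloc_name(dev, "eth%d");	/* dev->name becomes e.g. "eth3" */
	if (unit < 0)
		return unit;			/* -EINVAL or -ENFILE */

	return 0;
}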
1255
1256static int dev_get_valid_name(struct net *net, struct net_device *dev,
1257                              const char *name)
1258{
1259        BUG_ON(!net);
1260
1261        if (!dev_valid_name(name))
1262                return -EINVAL;
1263
1264        if (strchr(name, '%'))
1265                return dev_alloc_name_ns(net, dev, name);
1266        else if (__dev_get_by_name(net, name))
1267                return -EEXIST;
1268        else if (dev->name != name)
1269                strlcpy(dev->name, name, IFNAMSIZ);
1270
1271        return 0;
1272}
1273
1274/**
1275 *      dev_change_name - change name of a device
1276 *      @dev: device
1277 *      @newname: name (or format string) must be at least IFNAMSIZ
1278 *
 1279 *      Change the name of a device; a format string such as "eth%d"
 1280 *      can be passed for wildcarding.
1281 */
1282int dev_change_name(struct net_device *dev, const char *newname)
1283{
1284        unsigned char old_assign_type;
1285        char oldname[IFNAMSIZ];
1286        int err = 0;
1287        int ret;
1288        struct net *net;
1289
1290        ASSERT_RTNL();
1291        BUG_ON(!dev_net(dev));
1292
1293        net = dev_net(dev);
1294
1295        /* Some auto-enslaved devices e.g. failover slaves are
1296         * special, as userspace might rename the device after
1297         * the interface had been brought up and running since
1298         * the point kernel initiated auto-enslavement. Allow
1299         * live name change even when these slave devices are
1300         * up and running.
1301         *
1302         * Typically, users of these auto-enslaving devices
1303         * don't actually care about slave name change, as
1304         * they are supposed to operate on master interface
1305         * directly.
1306         */
1307        if (dev->flags & IFF_UP &&
1308            likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1309                return -EBUSY;
1310
1311        down_write(&devnet_rename_sem);
1312
1313        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1314                up_write(&devnet_rename_sem);
1315                return 0;
1316        }
1317
1318        memcpy(oldname, dev->name, IFNAMSIZ);
1319
1320        err = dev_get_valid_name(net, dev, newname);
1321        if (err < 0) {
1322                up_write(&devnet_rename_sem);
1323                return err;
1324        }
1325
1326        if (oldname[0] && !strchr(oldname, '%'))
1327                netdev_info(dev, "renamed from %s\n", oldname);
1328
1329        old_assign_type = dev->name_assign_type;
1330        dev->name_assign_type = NET_NAME_RENAMED;
1331
1332rollback:
1333        ret = device_rename(&dev->dev, dev->name);
1334        if (ret) {
1335                memcpy(dev->name, oldname, IFNAMSIZ);
1336                dev->name_assign_type = old_assign_type;
1337                up_write(&devnet_rename_sem);
1338                return ret;
1339        }
1340
1341        up_write(&devnet_rename_sem);
1342
1343        netdev_adjacent_rename_links(dev, oldname);
1344
1345        write_lock_bh(&dev_base_lock);
1346        netdev_name_node_del(dev->name_node);
1347        write_unlock_bh(&dev_base_lock);
1348
1349        synchronize_rcu();
1350
1351        write_lock_bh(&dev_base_lock);
1352        netdev_name_node_add(net, dev->name_node);
1353        write_unlock_bh(&dev_base_lock);
1354
1355        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1356        ret = notifier_to_errno(ret);
1357
1358        if (ret) {
1359                /* err >= 0 after dev_alloc_name() or stores the first errno */
1360                if (err >= 0) {
1361                        err = ret;
1362                        down_write(&devnet_rename_sem);
1363                        memcpy(dev->name, oldname, IFNAMSIZ);
1364                        memcpy(oldname, newname, IFNAMSIZ);
1365                        dev->name_assign_type = old_assign_type;
1366                        old_assign_type = NET_NAME_RENAMED;
1367                        goto rollback;
1368                } else {
1369                        pr_err("%s: name change rollback failed: %d\n",
1370                               dev->name, ret);
1371                }
1372        }
1373
1374        return err;
1375}
1376
1377/**
1378 *      dev_set_alias - change ifalias of a device
1379 *      @dev: device
1380 *      @alias: name up to IFALIASZ
1381 *      @len: limit of bytes to copy from info
1382 *
1383 *      Set ifalias for a device,
1384 */
1385int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1386{
1387        struct dev_ifalias *new_alias = NULL;
1388
1389        if (len >= IFALIASZ)
1390                return -EINVAL;
1391
1392        if (len) {
1393                new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1394                if (!new_alias)
1395                        return -ENOMEM;
1396
1397                memcpy(new_alias->ifalias, alias, len);
1398                new_alias->ifalias[len] = 0;
1399        }
1400
1401        mutex_lock(&ifalias_mutex);
1402        new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1403                                        mutex_is_locked(&ifalias_mutex));
1404        mutex_unlock(&ifalias_mutex);
1405
1406        if (new_alias)
1407                kfree_rcu(new_alias, rcuhead);
1408
1409        return len;
1410}
1411EXPORT_SYMBOL(dev_set_alias);
1412
1413/**
1414 *      dev_get_alias - get ifalias of a device
1415 *      @dev: device
1416 *      @name: buffer to store name of ifalias
1417 *      @len: size of buffer
1418 *
 1419 *      Get the ifalias for a device.  The caller must make sure dev cannot
 1420 *      go away, e.g. by holding the rcu read lock or a reference to the device.
1421 */
1422int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1423{
1424        const struct dev_ifalias *alias;
1425        int ret = 0;
1426
1427        rcu_read_lock();
1428        alias = rcu_dereference(dev->ifalias);
1429        if (alias)
1430                ret = snprintf(name, len, "%s", alias->ifalias);
1431        rcu_read_unlock();
1432
1433        return ret;
1434}
1435
1436/**
1437 *      netdev_features_change - device changes features
1438 *      @dev: device to cause notification
1439 *
1440 *      Called to indicate a device has changed features.
1441 */
1442void netdev_features_change(struct net_device *dev)
1443{
1444        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1445}
1446EXPORT_SYMBOL(netdev_features_change);
1447
1448/**
1449 *      netdev_state_change - device changes state
1450 *      @dev: device to cause notification
1451 *
1452 *      Called to indicate a device has changed state. This function calls
1453 *      the notifier chains for netdev_chain and sends a NEWLINK message
1454 *      to the routing socket.
1455 */
1456void netdev_state_change(struct net_device *dev)
1457{
1458        if (dev->flags & IFF_UP) {
1459                struct netdev_notifier_change_info change_info = {
1460                        .info.dev = dev,
1461                };
1462
1463                call_netdevice_notifiers_info(NETDEV_CHANGE,
1464                                              &change_info.info);
1465                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1466        }
1467}
1468EXPORT_SYMBOL(netdev_state_change);
1469
1470/**
1471 * netdev_notify_peers - notify network peers about existence of @dev
1472 * @dev: network device
1473 *
1474 * Generate traffic such that interested network peers are aware of
1475 * @dev, such as by generating a gratuitous ARP. This may be used when
1476 * a device wants to inform the rest of the network about some sort of
1477 * reconfiguration such as a failover event or virtual machine
1478 * migration.
1479 */
1480void netdev_notify_peers(struct net_device *dev)
1481{
1482        rtnl_lock();
1483        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1484        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1485        rtnl_unlock();
1486}
1487EXPORT_SYMBOL(netdev_notify_peers);
1488
1489static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1490{
1491        const struct net_device_ops *ops = dev->netdev_ops;
1492        int ret;
1493
1494        ASSERT_RTNL();
1495
1496        if (!netif_device_present(dev)) {
1497                /* may be detached because parent is runtime-suspended */
1498                if (dev->dev.parent)
1499                        pm_runtime_resume(dev->dev.parent);
1500                if (!netif_device_present(dev))
1501                        return -ENODEV;
1502        }
1503
1504        /* Block netpoll from trying to do any rx path servicing.
1505         * If we don't do this there is a chance ndo_poll_controller
1506         * or ndo_poll may be running while we open the device
1507         */
1508        netpoll_poll_disable(dev);
1509
1510        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1511        ret = notifier_to_errno(ret);
1512        if (ret)
1513                return ret;
1514
1515        set_bit(__LINK_STATE_START, &dev->state);
1516
1517        if (ops->ndo_validate_addr)
1518                ret = ops->ndo_validate_addr(dev);
1519
1520        if (!ret && ops->ndo_open)
1521                ret = ops->ndo_open(dev);
1522
1523        netpoll_poll_enable(dev);
1524
1525        if (ret)
1526                clear_bit(__LINK_STATE_START, &dev->state);
1527        else {
1528                dev->flags |= IFF_UP;
1529                dev_set_rx_mode(dev);
1530                dev_activate(dev);
1531                add_device_randomness(dev->dev_addr, dev->addr_len);
1532        }
1533
1534        return ret;
1535}
1536
1537/**
1538 *      dev_open        - prepare an interface for use.
1539 *      @dev: device to open
1540 *      @extack: netlink extended ack
1541 *
1542 *      Takes a device from down to up state. The device's private open
1543 *      function is invoked and then the multicast lists are loaded. Finally
1544 *      the device is moved into the up state and a %NETDEV_UP message is
1545 *      sent to the netdev notifier chain.
1546 *
1547 *      Calling this function on an active interface is a nop. On a failure
1548 *      a negative errno code is returned.
1549 */
1550int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1551{
1552        int ret;
1553
1554        if (dev->flags & IFF_UP)
1555                return 0;
1556
1557        ret = __dev_open(dev, extack);
1558        if (ret < 0)
1559                return ret;
1560
1561        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1562        call_netdevice_notifiers(NETDEV_UP, dev);
1563
1564        return ret;
1565}
1566EXPORT_SYMBOL(dev_open);
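
/* Usage sketch: bringing an interface up from kernel code. dev_open() must
 * be called with RTNL held; the interface name "eth0" and the use of
 * init_net are illustrative assumptions only.
 */
static int my_bring_up_eth0(void)
{
        struct net_device *dev;
        int err = -ENODEV;

        rtnl_lock();
        dev = __dev_get_by_name(&init_net, "eth0");     /* name is illustrative */
        if (dev)
                err = dev_open(dev, NULL);      /* NULL: no extack in this sketch */
        rtnl_unlock();
        return err;
}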
1567
1568static void __dev_close_many(struct list_head *head)
1569{
1570        struct net_device *dev;
1571
1572        ASSERT_RTNL();
1573        might_sleep();
1574
1575        list_for_each_entry(dev, head, close_list) {
1576                /* Temporarily disable netpoll until the interface is down */
1577                netpoll_poll_disable(dev);
1578
1579                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1580
1581                clear_bit(__LINK_STATE_START, &dev->state);
1582
1583                /* Synchronize to scheduled poll. We cannot touch the poll list; it
1584                 * may even be on a different CPU. So just clear netif_running().
1585                 *
1586                 * dev->stop() will invoke napi_disable() on all of its
1587                 * napi_struct instances on this device.
1588                 */
1589                smp_mb__after_atomic(); /* Commit netif_running(). */
1590        }
1591
1592        dev_deactivate_many(head);
1593
1594        list_for_each_entry(dev, head, close_list) {
1595                const struct net_device_ops *ops = dev->netdev_ops;
1596
1597                /*
1598                 *      Call the device-specific close. This cannot fail and is
1599                 *      only done if the device is UP.
1600                 *
1601                 *      We allow it to be called even after a DETACH hot-plug
1602                 *      event.
1603                 */
1604                if (ops->ndo_stop)
1605                        ops->ndo_stop(dev);
1606
1607                dev->flags &= ~IFF_UP;
1608                netpoll_poll_enable(dev);
1609        }
1610}
1611
1612static void __dev_close(struct net_device *dev)
1613{
1614        LIST_HEAD(single);
1615
1616        list_add(&dev->close_list, &single);
1617        __dev_close_many(&single);
1618        list_del(&single);
1619}
1620
1621void dev_close_many(struct list_head *head, bool unlink)
1622{
1623        struct net_device *dev, *tmp;
1624
1625        /* Remove the devices that don't need to be closed */
1626        list_for_each_entry_safe(dev, tmp, head, close_list)
1627                if (!(dev->flags & IFF_UP))
1628                        list_del_init(&dev->close_list);
1629
1630        __dev_close_many(head);
1631
1632        list_for_each_entry_safe(dev, tmp, head, close_list) {
1633                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1634                call_netdevice_notifiers(NETDEV_DOWN, dev);
1635                if (unlink)
1636                        list_del_init(&dev->close_list);
1637        }
1638}
1639EXPORT_SYMBOL(dev_close_many);
1640
1641/**
1642 *      dev_close - shutdown an interface.
1643 *      @dev: device to shutdown
1644 *
1645 *      This function moves an active device into down state. A
1646 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1647 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1648 *      chain.
1649 */
1650void dev_close(struct net_device *dev)
1651{
1652        if (dev->flags & IFF_UP) {
1653                LIST_HEAD(single);
1654
1655                list_add(&dev->close_list, &single);
1656                dev_close_many(&single, true);
1657                list_del(&single);
1658        }
1659}
1660EXPORT_SYMBOL(dev_close);
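
/* Companion sketch to the dev_open() example above: taking a device back
 * down, again under RTNL. dev_close() is already a no-op on a device that
 * is not IFF_UP, so no extra check is needed.
 */
static void my_bring_down(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_close(dev);
}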
1661
1662
1663/**
1664 *      dev_disable_lro - disable Large Receive Offload on a device
1665 *      @dev: device
1666 *
1667 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1668 *      called under RTNL.  This is needed if received packets may be
1669 *      forwarded to another interface.
1670 */
1671void dev_disable_lro(struct net_device *dev)
1672{
1673        struct net_device *lower_dev;
1674        struct list_head *iter;
1675
1676        dev->wanted_features &= ~NETIF_F_LRO;
1677        netdev_update_features(dev);
1678
1679        if (unlikely(dev->features & NETIF_F_LRO))
1680                netdev_WARN(dev, "failed to disable LRO!\n");
1681
1682        netdev_for_each_lower_dev(dev, lower_dev, iter)
1683                dev_disable_lro(lower_dev);
1684}
1685EXPORT_SYMBOL(dev_disable_lro);
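
/* Usage sketch: code that starts forwarding packets received on a device to
 * another interface would disable LRO first, under RTNL, since LRO-merged
 * frames must not be forwarded. The helper name is invented.
 */
static void my_enable_forwarding(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_disable_lro(dev);   /* recurses into lower (stacked) devices too */
        /* ... proceed with forwarding setup ... */
}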
1686
1687/**
1688 *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1689 *      @dev: device
1690 *
1691 *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1692 *      called under RTNL.  This is needed if Generic XDP is installed on
1693 *      the device.
1694 */
1695static void dev_disable_gro_hw(struct net_device *dev)
1696{
1697        dev->wanted_features &= ~NETIF_F_GRO_HW;
1698        netdev_update_features(dev);
1699
1700        if (unlikely(dev->features & NETIF_F_GRO_HW))
1701                netdev_WARN(dev, "failed to disable GRO_HW!\n");
1702}
1703
1704const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1705{
1706#define N(val)                                          \
1707        case NETDEV_##val:                              \
1708                return "NETDEV_" __stringify(val);
1709        switch (cmd) {
1710        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1711        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1712        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1713        N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1714        N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1715        N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1716        N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1717        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1718        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1719        N(PRE_CHANGEADDR)
1720        }
1721#undef N
1722        return "UNKNOWN_NETDEV_EVENT";
1723}
1724EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1725
1726static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1727                                   struct net_device *dev)
1728{
1729        struct netdev_notifier_info info = {
1730                .dev = dev,
1731        };
1732
1733        return nb->notifier_call(nb, val, &info);
1734}
1735
1736static int call_netdevice_register_notifiers(struct notifier_block *nb,
1737                                             struct net_device *dev)
1738{
1739        int err;
1740
1741        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1742        err = notifier_to_errno(err);
1743        if (err)
1744                return err;
1745
1746        if (!(dev->flags & IFF_UP))
1747                return 0;
1748
1749        call_netdevice_notifier(nb, NETDEV_UP, dev);
1750        return 0;
1751}
1752
1753static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1754                                                struct net_device *dev)
1755{
1756        if (dev->flags & IFF_UP) {
1757                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1758                                        dev);
1759                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1760        }
1761        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1762}
1763
1764static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1765                                                 struct net *net)
1766{
1767        struct net_device *dev;
1768        int err;
1769
1770        for_each_netdev(net, dev) {
1771                err = call_netdevice_register_notifiers(nb, dev);
1772                if (err)
1773                        goto rollback;
1774        }
1775        return 0;
1776
1777rollback:
1778        for_each_netdev_continue_reverse(net, dev)
1779                call_netdevice_unregister_notifiers(nb, dev);
1780        return err;
1781}
1782
1783static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1784                                                    struct net *net)
1785{
1786        struct net_device *dev;
1787
1788        for_each_netdev(net, dev)
1789                call_netdevice_unregister_notifiers(nb, dev);
1790}
1791
1792static int dev_boot_phase = 1;
1793
1794/**
1795 * register_netdevice_notifier - register a network notifier block
1796 * @nb: notifier
1797 *
1798 * Register a notifier to be called when network device events occur.
1799 * The notifier passed is linked into the kernel structures and must
1800 * not be reused until it has been unregistered. A negative errno code
1801 * is returned on a failure.
1802 *
1803 * When registered, all registration and up events are replayed to
1804 * the new notifier so that it gets a race-free view of the network
1805 * device list.
1806 */
1807
1808int register_netdevice_notifier(struct notifier_block *nb)
1809{
1810        struct net *net;
1811        int err;
1812
1813        /* Close race with setup_net() and cleanup_net() */
1814        down_write(&pernet_ops_rwsem);
1815        rtnl_lock();
1816        err = raw_notifier_chain_register(&netdev_chain, nb);
1817        if (err)
1818                goto unlock;
1819        if (dev_boot_phase)
1820                goto unlock;
1821        for_each_net(net) {
1822                err = call_netdevice_register_net_notifiers(nb, net);
1823                if (err)
1824                        goto rollback;
1825        }
1826
1827unlock:
1828        rtnl_unlock();
1829        up_write(&pernet_ops_rwsem);
1830        return err;
1831
1832rollback:
1833        for_each_net_continue_reverse(net)
1834                call_netdevice_unregister_net_notifiers(nb, net);
1835
1836        raw_notifier_chain_unregister(&netdev_chain, nb);
1837        goto unlock;
1838}
1839EXPORT_SYMBOL(register_netdevice_notifier);
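
/* Usage sketch of a typical notifier consumer (names invented). The
 * callback runs with RTNL held; netdev_notifier_info_to_dev() recovers
 * the net_device from the opaque notifier payload.
 */
static int my_netdev_event(struct notifier_block *nb,
                           unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                pr_info("%s is up\n", dev->name);
                break;
        case NETDEV_GOING_DOWN:
                pr_info("%s is going down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block my_netdev_nb = {
        .notifier_call = my_netdev_event,
};

/* Registered with register_netdevice_notifier(&my_netdev_nb) at init time
 * and removed with unregister_netdevice_notifier(&my_netdev_nb) on exit.
 */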
1840
1841/**
1842 * unregister_netdevice_notifier - unregister a network notifier block
1843 * @nb: notifier
1844 *
1845 * Unregister a notifier previously registered by
1846 * register_netdevice_notifier(). The notifier is unlinked from the
1847 * kernel structures and may then be reused. A negative errno code
1848 * is returned on a failure.
1849 *
1850 * After unregistering unregister and down device events are synthesized
1851 * for all devices on the device list to the removed notifier to remove
1852 * the need for special case cleanup code.
1853 */
1854
1855int unregister_netdevice_notifier(struct notifier_block *nb)
1856{
1857        struct net *net;
1858        int err;
1859
1860        /* Close race with setup_net() and cleanup_net() */
1861        down_write(&pernet_ops_rwsem);
1862        rtnl_lock();
1863        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1864        if (err)
1865                goto unlock;
1866
1867        for_each_net(net)
1868                call_netdevice_unregister_net_notifiers(nb, net);
1869
1870unlock:
1871        rtnl_unlock();
1872        up_write(&pernet_ops_rwsem);
1873        return err;
1874}
1875EXPORT_SYMBOL(unregister_netdevice_notifier);
1876
1877static int __register_netdevice_notifier_net(struct net *net,
1878                                             struct notifier_block *nb,
1879                                             bool ignore_call_fail)
1880{
1881        int err;
1882
1883        err = raw_notifier_chain_register(&net->netdev_chain, nb);
1884        if (err)
1885                return err;
1886        if (dev_boot_phase)
1887                return 0;
1888
1889        err = call_netdevice_register_net_notifiers(nb, net);
1890        if (err && !ignore_call_fail)
1891                goto chain_unregister;
1892
1893        return 0;
1894
1895chain_unregister:
1896        raw_notifier_chain_unregister(&net->netdev_chain, nb);
1897        return err;
1898}
1899
1900static int __unregister_netdevice_notifier_net(struct net *net,
1901                                               struct notifier_block *nb)
1902{
1903        int err;
1904
1905        err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1906        if (err)
1907                return err;
1908
1909        call_netdevice_unregister_net_notifiers(nb, net);
1910        return 0;
1911}
1912
1913/**
1914 * register_netdevice_notifier_net - register a per-netns network notifier block
1915 * @net: network namespace
1916 * @nb: notifier
1917 *
1918 * Register a notifier to be called when network device events occur.
1919 * The notifier passed is linked into the kernel structures and must
1920 * not be reused until it has been unregistered. A negative errno code
1921 * is returned on a failure.
1922 *
1923 * When registered, all registration and up events are replayed to
1924 * the new notifier so that it gets a race-free view of the network
1925 * device list.
1926 */
1927
1928int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1929{
1930        int err;
1931
1932        rtnl_lock();
1933        err = __register_netdevice_notifier_net(net, nb, false);
1934        rtnl_unlock();
1935        return err;
1936}
1937EXPORT_SYMBOL(register_netdevice_notifier_net);
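
/* Usage sketch: the per-netns variant scopes the same kind of callback to
 * a single namespace instead of all of them. This reuses the hypothetical
 * my_netdev_nb block from the earlier sketch, with init_net as an example.
 */
static int my_watch_init_net(void)
{
        return register_netdevice_notifier_net(&init_net, &my_netdev_nb);
}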
1938
1939/**
1940 * unregister_netdevice_notifier_net - unregister a per-netns
1941 *                                     network notifier block
1942 * @net: network namespace
1943 * @nb: notifier
1944 *
1945 * Unregister a notifier previously registered by
1946 * register_netdevice_notifier_net(). The notifier is unlinked from the
1947 * kernel structures and may then be reused. A negative errno code
1948 * is returned on a failure.
1949 *
1950 * After unregistering unregister and down device events are synthesized
1951 * for all devices on the device list to the removed notifier to remove
1952 * the need for special case cleanup code.
1953 */
1954
1955int unregister_netdevice_notifier_net(struct net *net,
1956                                      struct notifier_block *nb)
1957{
1958        int err;
1959
1960        rtnl_lock();
1961        err = __unregister_netdevice_notifier_net(net, nb);
1962        rtnl_unlock();
1963        return err;
1964}
1965EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1966
1967int register_netdevice_notifier_dev_net(struct net_device *dev,
1968                                        struct notifier_block *nb,
1969                                        struct netdev_net_notifier *nn)
1970{
1971        int err;
1972
1973        rtnl_lock();
1974        err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1975        if (!err) {
1976                nn->nb = nb;
1977                list_add(&nn->list, &dev->net_notifier_list);
1978        }
1979        rtnl_unlock();
1980        return err;
1981}
1982EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1983
1984int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1985                                          struct notifier_block *nb,
1986                                          struct netdev_net_notifier *nn)
1987{
1988        int err;
1989
1990        rtnl_lock();
1991        list_del(&nn->list);
1992        err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1993        rtnl_unlock();
1994        return err;
1995}
1996EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1997
1998static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1999                                             struct net *net)
2000{
2001        struct netdev_net_notifier *nn;
2002
2003        list_for_each_entry(nn, &dev->net_notifier_list, list) {
2004                __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
2005                __register_netdevice_notifier_net(net, nn->nb, true);
2006        }
2007}
2008
2009/**
2010 *      call_netdevice_notifiers_info - call all network notifier blocks
2011 *      @val: value passed unmodified to notifier function
2012 *      @info: notifier information data
2013 *
2014 *      Call all network notifier blocks.  Parameters and return value
2015 *      are as for raw_notifier_call_chain().
2016 */
2017
2018static int call_netdevice_notifiers_info(unsigned long val,
2019                                         struct netdev_notifier_info *info)
2020{
2021        struct net *net = dev_net(info->dev);
2022        int ret;
2023
2024        ASSERT_RTNL();
2025
2026        /* Run per-netns notifier block chain first, then run the global one.
2027         * Hopefully, one day, the global one is going to be removed after
2028         * all notifier block registrants are converted to be per-netns.
2029         */
2030        ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2031        if (ret & NOTIFY_STOP_MASK)
2032                return ret;
2033        return raw_notifier_call_chain(&netdev_chain, val, info);
2034}
2035
2036static int call_netdevice_notifiers_extack(unsigned long val,
2037                                           struct net_device *dev,
2038                                           struct netlink_ext_ack *extack)
2039{
2040        struct netdev_notifier_info info = {
2041                .dev = dev,
2042                .extack = extack,
2043        };
2044
2045        return call_netdevice_notifiers_info(val, &info);
2046}
2047
2048/**
2049 *      call_netdevice_notifiers - call all network notifier blocks
2050 *      @val: value passed unmodified to notifier function
2051 *      @dev: net_device pointer passed unmodified to notifier function
2052 *
2053 *      Call all network notifier blocks.  Parameters and return value
2054 *      are as for raw_notifier_call_chain().
2055 */
2056
2057int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2058{
2059        return call_netdevice_notifiers_extack(val, dev, NULL);
2060}
2061EXPORT_SYMBOL(call_netdevice_notifiers);
2062
2063/**
2064 *      call_netdevice_notifiers_mtu - call all network notifier blocks
2065 *      @val: value passed unmodified to notifier function
2066 *      @dev: net_device pointer passed unmodified to notifier function
2067 *      @arg: additional u32 argument passed to the notifier function
2068 *
2069 *      Call all network notifier blocks.  Parameters and return value
2070 *      are as for raw_notifier_call_chain().
2071 */
2072static int call_netdevice_notifiers_mtu(unsigned long val,
2073                                        struct net_device *dev, u32 arg)
2074{
2075        struct netdev_notifier_info_ext info = {
2076                .info.dev = dev,
2077                .ext.mtu = arg,
2078        };
2079
2080        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2081
2082        return call_netdevice_notifiers_info(val, &info.info);
2083}
2084
2085#ifdef CONFIG_NET_INGRESS
2086static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2087
2088void net_inc_ingress_queue(void)
2089{
2090        static_branch_inc(&ingress_needed_key);
2091}
2092EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2093
2094void net_dec_ingress_queue(void)
2095{
2096        static_branch_dec(&ingress_needed_key);
2097}
2098EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2099#endif
2100
2101#ifdef CONFIG_NET_EGRESS
2102static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2103
2104void net_inc_egress_queue(void)
2105{
2106        static_branch_inc(&egress_needed_key);
2107}
2108EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2109
2110void net_dec_egress_queue(void)
2111{
2112        static_branch_dec(&egress_needed_key);
2113}
2114EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2115#endif
2116
2117static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2118#ifdef CONFIG_JUMP_LABEL
2119static atomic_t netstamp_needed_deferred;
2120static atomic_t netstamp_wanted;
2121static void netstamp_clear(struct work_struct *work)
2122{
2123        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2124        int wanted;
2125
2126        wanted = atomic_add_return(deferred, &netstamp_wanted);
2127        if (wanted > 0)
2128                static_branch_enable(&netstamp_needed_key);
2129        else
2130                static_branch_disable(&netstamp_needed_key);
2131}
2132static DECLARE_WORK(netstamp_work, netstamp_clear);
2133#endif
2134
2135void net_enable_timestamp(void)
2136{
2137#ifdef CONFIG_JUMP_LABEL
2138        int wanted;
2139
2140        while (1) {
2141                wanted = atomic_read(&netstamp_wanted);
2142                if (wanted <= 0)
2143                        break;
2144                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2145                        return;
2146        }
2147        atomic_inc(&netstamp_needed_deferred);
2148        schedule_work(&netstamp_work);
2149#else
2150        static_branch_inc(&netstamp_needed_key);
2151#endif
2152}
2153EXPORT_SYMBOL(net_enable_timestamp);
2154
2155void net_disable_timestamp(void)
2156{
2157#ifdef CONFIG_JUMP_LABEL
2158        int wanted;
2159
2160        while (1) {
2161                wanted = atomic_read(&netstamp_wanted);
2162                if (wanted <= 1)
2163                        break;
2164                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2165                        return;
2166        }
2167        atomic_dec(&netstamp_needed_deferred);
2168        schedule_work(&netstamp_work);
2169#else
2170        static_branch_dec(&netstamp_needed_key);
2171#endif
2172}
2173EXPORT_SYMBOL(net_disable_timestamp);
2174
2175static inline void net_timestamp_set(struct sk_buff *skb)
2176{
2177        skb->tstamp = 0;
2178        if (static_branch_unlikely(&netstamp_needed_key))
2179                __net_timestamp(skb);
2180}
2181
2182#define net_timestamp_check(COND, SKB)                          \
2183        if (static_branch_unlikely(&netstamp_needed_key)) {     \
2184                if ((COND) && !(SKB)->tstamp)                   \
2185                        __net_timestamp(SKB);                   \
2186        }                                                       \
2187
2188bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2189{
2190        unsigned int len;
2191
2192        if (!(dev->flags & IFF_UP))
2193                return false;
2194
2195        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2196        if (skb->len <= len)
2197                return true;
2198
2199        /* if TSO is enabled, we don't care about the length as the packet
2200         * could be forwarded without being segmented first.
2201         */
2202        if (skb_is_gso(skb))
2203                return true;
2204
2205        return false;
2206}
2207EXPORT_SYMBOL_GPL(is_skb_forwardable);
2208
2209int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2210{
2211        int ret = ____dev_forward_skb(dev, skb);
2212
2213        if (likely(!ret)) {
2214                skb->protocol = eth_type_trans(skb, dev);
2215                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2216        }
2217
2218        return ret;
2219}
2220EXPORT_SYMBOL_GPL(__dev_forward_skb);
2221
2222/**
2223 * dev_forward_skb - loopback an skb to another netif
2224 *
2225 * @dev: destination network device
2226 * @skb: buffer to forward
2227 *
2228 * return values:
2229 *      NET_RX_SUCCESS  (no congestion)
2230 *      NET_RX_DROP     (packet was dropped, but freed)
2231 *
2232 * dev_forward_skb can be used for injecting an skb from the
2233 * start_xmit function of one device into the receive queue
2234 * of another device.
2235 *
2236 * The receiving device may be in another namespace, so
2237 * we have to clear all information in the skb that could
2238 * impact namespace isolation.
2239 */
2240int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2241{
2242        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2243}
2244EXPORT_SYMBOL_GPL(dev_forward_skb);
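
/* Usage sketch of how a veth-like virtual driver might feed its peer's
 * receive path from ndo_start_xmit. my_get_peer() is a hypothetical helper;
 * dev_forward_skb() consumes the skb on both success and drop.
 */
static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *peer = my_get_peer(dev);     /* hypothetical helper */

        if (!peer) {
                kfree_skb(skb);
                dev->stats.tx_dropped++;
                return NETDEV_TX_OK;
        }

        if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
                dev->stats.tx_packets++;
        else
                dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}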
2245
2246static inline int deliver_skb(struct sk_buff *skb,
2247                              struct packet_type *pt_prev,
2248                              struct net_device *orig_dev)
2249{
2250        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2251                return -ENOMEM;
2252        refcount_inc(&skb->users);
2253        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2254}
2255
2256static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2257                                          struct packet_type **pt,
2258                                          struct net_device *orig_dev,
2259                                          __be16 type,
2260                                          struct list_head *ptype_list)
2261{
2262        struct packet_type *ptype, *pt_prev = *pt;
2263
2264        list_for_each_entry_rcu(ptype, ptype_list, list) {
2265                if (ptype->type != type)
2266                        continue;
2267                if (pt_prev)
2268                        deliver_skb(skb, pt_prev, orig_dev);
2269                pt_prev = ptype;
2270        }
2271        *pt = pt_prev;
2272}
2273
2274static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2275{
2276        if (!ptype->af_packet_priv || !skb->sk)
2277                return false;
2278
2279        if (ptype->id_match)
2280                return ptype->id_match(ptype, skb->sk);
2281        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2282                return true;
2283
2284        return false;
2285}
2286
2287/**
2288 * dev_nit_active - return true if any network interface taps are in use
2289 *
2290 * @dev: network device to check for the presence of taps
2291 */
2292bool dev_nit_active(struct net_device *dev)
2293{
2294        return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2295}
2296EXPORT_SYMBOL_GPL(dev_nit_active);
2297
2298/*
2299 *      Support routine. Sends outgoing frames to any network
2300 *      taps currently in use.
2301 */
2302
2303void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2304{
2305        struct packet_type *ptype;
2306        struct sk_buff *skb2 = NULL;
2307        struct packet_type *pt_prev = NULL;
2308        struct list_head *ptype_list = &ptype_all;
2309
2310        rcu_read_lock();
2311again:
2312        list_for_each_entry_rcu(ptype, ptype_list, list) {
2313                if (ptype->ignore_outgoing)
2314                        continue;
2315
2316                /* Never send packets back to the socket
2317                 * they originated from - MvS (miquels@drinkel.ow.org)
2318                 */
2319                if (skb_loop_sk(ptype, skb))
2320                        continue;
2321
2322                if (pt_prev) {
2323                        deliver_skb(skb2, pt_prev, skb->dev);
2324                        pt_prev = ptype;
2325                        continue;
2326                }
2327
2328                /* need to clone skb, done only once */
2329                skb2 = skb_clone(skb, GFP_ATOMIC);
2330                if (!skb2)
2331                        goto out_unlock;
2332
2333                net_timestamp_set(skb2);
2334
2335                /* The network header should be correctly
2336                 * set by the sender, so that the second statement is
2337                 * just protection against buggy protocols.
2338                 */
2339                skb_reset_mac_header(skb2);
2340
2341                if (skb_network_header(skb2) < skb2->data ||
2342                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2343                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2344                                             ntohs(skb2->protocol),
2345                                             dev->name);
2346                        skb_reset_network_header(skb2);
2347                }
2348
2349                skb2->transport_header = skb2->network_header;
2350                skb2->pkt_type = PACKET_OUTGOING;
2351                pt_prev = ptype;
2352        }
2353
2354        if (ptype_list == &ptype_all) {
2355                ptype_list = &dev->ptype_all;
2356                goto again;
2357        }
2358out_unlock:
2359        if (pt_prev) {
2360                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2361                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2362                else
2363                        kfree_skb(skb2);
2364        }
2365        rcu_read_unlock();
2366}
2367EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
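
/* For orientation (a sketch, not core code): the transmit path uses the two
 * helpers above roughly like this, cloning to taps only when one exists.
 * my_hw_xmit() is an invented stand-in for the actual hardware transmit.
 */
static netdev_tx_t my_tap_aware_xmit(struct sk_buff *skb, struct net_device *dev)
{
        if (dev_nit_active(dev))
                dev_queue_xmit_nit(skb, dev);   /* copy goes to AF_PACKET taps */
        return my_hw_xmit(skb, dev);            /* hypothetical hw xmit */
}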
2368
2369/**
2370 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2371 * @dev: Network device
2372 * @txq: number of queues available
2373 *
2374 * If real_num_tx_queues is changed the tc mappings may no longer be
2375 * valid. To resolve this, verify that each tc mapping remains valid and,
2376 * if not, zero the mapping. With no priorities mapping to an
2377 * offset/count pair, that pair will no longer be used. In the worst case,
2378 * if TC0 is invalid, nothing can be done, so priority mappings are
2379 * disabled entirely. It is expected that drivers will fix this mapping
2380 * if they can before calling netif_set_real_num_tx_queues.
2381 */
2382static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2383{
2384        int i;
2385        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2386
2387        /* If TC0 is invalidated disable TC mapping */
2388        if (tc->offset + tc->count > txq) {
2389                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2390                dev->num_tc = 0;
2391                return;
2392        }
2393
2394        /* Invalidated prio to tc mappings set to TC0 */
2395        for (i = 1; i < TC_BITMASK + 1; i++) {
2396                int q = netdev_get_prio_tc_map(dev, i);
2397
2398                tc = &dev->tc_to_txq[q];
2399                if (tc->offset + tc->count > txq) {
2400                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2401                                i, q);
2402                        netdev_set_prio_tc_map(dev, i, 0);
2403                }
2404        }
2405}
2406
2407int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2408{
2409        if (dev->num_tc) {
2410                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2411                int i;
2412
2413                /* walk through the TCs and see if it falls into any of them */
2414                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2415                        if ((txq - tc->offset) < tc->count)
2416                                return i;
2417                }
2418
2419                /* didn't find it, just return -1 to indicate no match */
2420                return -1;
2421        }
2422
2423        return 0;
2424}
2425EXPORT_SYMBOL(netdev_txq_to_tc);
2426
2427#ifdef CONFIG_XPS
2428struct static_key xps_needed __read_mostly;
2429EXPORT_SYMBOL(xps_needed);
2430struct static_key xps_rxqs_needed __read_mostly;
2431EXPORT_SYMBOL(xps_rxqs_needed);
2432static DEFINE_MUTEX(xps_map_mutex);
2433#define xmap_dereference(P)             \
2434        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2435
2436static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2437                             int tci, u16 index)
2438{
2439        struct xps_map *map = NULL;
2440        int pos;
2441
2442        if (dev_maps)
2443                map = xmap_dereference(dev_maps->attr_map[tci]);
2444        if (!map)
2445                return false;
2446
2447        for (pos = map->len; pos--;) {
2448                if (map->queues[pos] != index)
2449                        continue;
2450
2451                if (map->len > 1) {
2452                        map->queues[pos] = map->queues[--map->len];
2453                        break;
2454                }
2455
2456                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2457                kfree_rcu(map, rcu);
2458                return false;
2459        }
2460
2461        return true;
2462}
2463
2464static bool remove_xps_queue_cpu(struct net_device *dev,
2465                                 struct xps_dev_maps *dev_maps,
2466                                 int cpu, u16 offset, u16 count)
2467{
2468        int num_tc = dev->num_tc ? : 1;
2469        bool active = false;
2470        int tci;
2471
2472        for (tci = cpu * num_tc; num_tc--; tci++) {
2473                int i, j;
2474
2475                for (i = count, j = offset; i--; j++) {
2476                        if (!remove_xps_queue(dev_maps, tci, j))
2477                                break;
2478                }
2479
2480                active |= i < 0;
2481        }
2482
2483        return active;
2484}
2485
2486static void reset_xps_maps(struct net_device *dev,
2487                           struct xps_dev_maps *dev_maps,
2488                           bool is_rxqs_map)
2489{
2490        if (is_rxqs_map) {
2491                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2492                RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2493        } else {
2494                RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2495        }
2496        static_key_slow_dec_cpuslocked(&xps_needed);
2497        kfree_rcu(dev_maps, rcu);
2498}
2499
2500static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2501                           struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2502                           u16 offset, u16 count, bool is_rxqs_map)
2503{
2504        bool active = false;
2505        int i, j;
2506
2507        for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2508             j < nr_ids;)
2509                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2510                                               count);
2511        if (!active)
2512                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2513
2514        if (!is_rxqs_map) {
2515                for (i = offset + (count - 1); count--; i--) {
2516                        netdev_queue_numa_node_write(
2517                                netdev_get_tx_queue(dev, i),
2518                                NUMA_NO_NODE);
2519                }
2520        }
2521}
2522
2523static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2524                                   u16 count)
2525{
2526        const unsigned long *possible_mask = NULL;
2527        struct xps_dev_maps *dev_maps;
2528        unsigned int nr_ids;
2529
2530        if (!static_key_false(&xps_needed))
2531                return;
2532
2533        cpus_read_lock();
2534        mutex_lock(&xps_map_mutex);
2535
2536        if (static_key_false(&xps_rxqs_needed)) {
2537                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2538                if (dev_maps) {
2539                        nr_ids = dev->num_rx_queues;
2540                        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2541                                       offset, count, true);
2542                }
2543        }
2544
2545        dev_maps = xmap_dereference(dev->xps_cpus_map);
2546        if (!dev_maps)
2547                goto out_no_maps;
2548
2549        if (num_possible_cpus() > 1)
2550                possible_mask = cpumask_bits(cpu_possible_mask);
2551        nr_ids = nr_cpu_ids;
2552        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2553                       false);
2554
2555out_no_maps:
2556        mutex_unlock(&xps_map_mutex);
2557        cpus_read_unlock();
2558}
2559
2560static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2561{
2562        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2563}
2564
2565static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2566                                      u16 index, bool is_rxqs_map)
2567{
2568        struct xps_map *new_map;
2569        int alloc_len = XPS_MIN_MAP_ALLOC;
2570        int i, pos;
2571
2572        for (pos = 0; map && pos < map->len; pos++) {
2573                if (map->queues[pos] != index)
2574                        continue;
2575                return map;
2576        }
2577
2578        /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2579        if (map) {
2580                if (pos < map->alloc_len)
2581                        return map;
2582
2583                alloc_len = map->alloc_len * 2;
2584        }
2585
2586        /* Need to allocate a new map to store the tx-queue for this
2587         * CPU/rx-queue.
2588         */
2589        if (is_rxqs_map)
2590                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2591        else
2592                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2593                                       cpu_to_node(attr_index));
2594        if (!new_map)
2595                return NULL;
2596
2597        for (i = 0; i < pos; i++)
2598                new_map->queues[i] = map->queues[i];
2599        new_map->alloc_len = alloc_len;
2600        new_map->len = pos;
2601
2602        return new_map;
2603}
2604
2605/* Must be called under cpus_read_lock */
2606int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2607                          u16 index, bool is_rxqs_map)
2608{
2609        const unsigned long *online_mask = NULL, *possible_mask = NULL;
2610        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2611        int i, j, tci, numa_node_id = -2;
2612        int maps_sz, num_tc = 1, tc = 0;
2613        struct xps_map *map, *new_map;
2614        bool active = false;
2615        unsigned int nr_ids;
2616
2617        if (dev->num_tc) {
2618                /* Do not allow XPS on subordinate device directly */
2619                num_tc = dev->num_tc;
2620                if (num_tc < 0)
2621                        return -EINVAL;
2622
2623                /* If queue belongs to subordinate dev use its map */
2624                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2625
2626                tc = netdev_txq_to_tc(dev, index);
2627                if (tc < 0)
2628                        return -EINVAL;
2629        }
2630
2631        mutex_lock(&xps_map_mutex);
2632        if (is_rxqs_map) {
2633                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2634                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2635                nr_ids = dev->num_rx_queues;
2636        } else {
2637                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2638                if (num_possible_cpus() > 1) {
2639                        online_mask = cpumask_bits(cpu_online_mask);
2640                        possible_mask = cpumask_bits(cpu_possible_mask);
2641                }
2642                dev_maps = xmap_dereference(dev->xps_cpus_map);
2643                nr_ids = nr_cpu_ids;
2644        }
2645
2646        if (maps_sz < L1_CACHE_BYTES)
2647                maps_sz = L1_CACHE_BYTES;
2648
2649        /* allocate memory for queue storage */
2650        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2651             j < nr_ids;) {
2652                if (!new_dev_maps)
2653                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2654                if (!new_dev_maps) {
2655                        mutex_unlock(&xps_map_mutex);
2656                        return -ENOMEM;
2657                }
2658
2659                tci = j * num_tc + tc;
2660                map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2661                                 NULL;
2662
2663                map = expand_xps_map(map, j, index, is_rxqs_map);
2664                if (!map)
2665                        goto error;
2666
2667                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2668        }
2669
2670        if (!new_dev_maps)
2671                goto out_no_new_maps;
2672
2673        if (!dev_maps) {
2674                /* Increment static keys at most once per type */
2675                static_key_slow_inc_cpuslocked(&xps_needed);
2676                if (is_rxqs_map)
2677                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2678        }
2679
2680        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2681             j < nr_ids;) {
2682                /* copy maps belonging to foreign traffic classes */
2683                for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2684                        /* fill in the new device map from the old device map */
2685                        map = xmap_dereference(dev_maps->attr_map[tci]);
2686                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2687                }
2688
2689                /* We need to explicitly update tci as the previous loop
2690                 * could break out early if dev_maps is NULL.
2691                 */
2692                tci = j * num_tc + tc;
2693
2694                if (netif_attr_test_mask(j, mask, nr_ids) &&
2695                    netif_attr_test_online(j, online_mask, nr_ids)) {
2696                        /* add tx-queue to CPU/rx-queue maps */
2697                        int pos = 0;
2698
2699                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
2700                        while ((pos < map->len) && (map->queues[pos] != index))
2701                                pos++;
2702
2703                        if (pos == map->len)
2704                                map->queues[map->len++] = index;
2705#ifdef CONFIG_NUMA
2706                        if (!is_rxqs_map) {
2707                                if (numa_node_id == -2)
2708                                        numa_node_id = cpu_to_node(j);
2709                                else if (numa_node_id != cpu_to_node(j))
2710                                        numa_node_id = -1;
2711                        }
2712#endif
2713                } else if (dev_maps) {
2714                        /* fill in the new device map from the old device map */
2715                        map = xmap_dereference(dev_maps->attr_map[tci]);
2716                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2717                }
2718
2719                /* copy maps belonging to foreign traffic classes */
2720                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2721                        /* fill in the new device map from the old device map */
2722                        map = xmap_dereference(dev_maps->attr_map[tci]);
2723                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2724                }
2725        }
2726
2727        if (is_rxqs_map)
2728                rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2729        else
2730                rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2731
2732        /* Cleanup old maps */
2733        if (!dev_maps)
2734                goto out_no_old_maps;
2735
2736        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2737             j < nr_ids;) {
2738                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2739                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2740                        map = xmap_dereference(dev_maps->attr_map[tci]);
2741                        if (map && map != new_map)
2742                                kfree_rcu(map, rcu);
2743                }
2744        }
2745
2746        kfree_rcu(dev_maps, rcu);
2747
2748out_no_old_maps:
2749        dev_maps = new_dev_maps;
2750        active = true;
2751
2752out_no_new_maps:
2753        if (!is_rxqs_map) {
2754                /* update Tx queue numa node */
2755                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2756                                             (numa_node_id >= 0) ?
2757                                             numa_node_id : NUMA_NO_NODE);
2758        }
2759
2760        if (!dev_maps)
2761                goto out_no_maps;
2762
2763        /* removes tx-queue from unused CPUs/rx-queues */
2764        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2765             j < nr_ids;) {
2766                for (i = tc, tci = j * num_tc; i--; tci++)
2767                        active |= remove_xps_queue(dev_maps, tci, index);
2768                if (!netif_attr_test_mask(j, mask, nr_ids) ||
2769                    !netif_attr_test_online(j, online_mask, nr_ids))
2770                        active |= remove_xps_queue(dev_maps, tci, index);
2771                for (i = num_tc - tc, tci++; --i; tci++)
2772                        active |= remove_xps_queue(dev_maps, tci, index);
2773        }
2774
2775        /* free map if not active */
2776        if (!active)
2777                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2778
2779out_no_maps:
2780        mutex_unlock(&xps_map_mutex);
2781
2782        return 0;
2783error:
2784        /* remove any maps that we added */
2785        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2786             j < nr_ids;) {
2787                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2788                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2789                        map = dev_maps ?
2790                              xmap_dereference(dev_maps->attr_map[tci]) :
2791                              NULL;
2792                        if (new_map && new_map != map)
2793                                kfree(new_map);
2794                }
2795        }
2796
2797        mutex_unlock(&xps_map_mutex);
2798
2799        kfree(new_dev_maps);
2800        return -ENOMEM;
2801}
2802EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2803
2804int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2805                        u16 index)
2806{
2807        int ret;
2808
2809        cpus_read_lock();
2810        ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2811        cpus_read_unlock();
2812
2813        return ret;
2814}
2815EXPORT_SYMBOL(netif_set_xps_queue);
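
/* Usage sketch: a multiqueue driver pinning each Tx queue to one CPU via
 * XPS. The 1:1 queue-to-CPU policy here is an arbitrary example.
 */
static void my_setup_xps(struct net_device *dev)
{
        int i;

        for (i = 0; i < dev->real_num_tx_queues; i++)
                netif_set_xps_queue(dev, cpumask_of(i % nr_cpu_ids), i);
}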
2816
2817#endif
2818static void netdev_unbind_all_sb_channels(struct net_device *dev)
2819{
2820        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2821
2822        /* Unbind any subordinate channels */
2823        while (txq-- != &dev->_tx[0]) {
2824                if (txq->sb_dev)
2825                        netdev_unbind_sb_channel(dev, txq->sb_dev);
2826        }
2827}
2828
2829void netdev_reset_tc(struct net_device *dev)
2830{
2831#ifdef CONFIG_XPS
2832        netif_reset_xps_queues_gt(dev, 0);
2833#endif
2834        netdev_unbind_all_sb_channels(dev);
2835
2836        /* Reset TC configuration of device */
2837        dev->num_tc = 0;
2838        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2839        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2840}
2841EXPORT_SYMBOL(netdev_reset_tc);
2842
2843int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2844{
2845        if (tc >= dev->num_tc)
2846                return -EINVAL;
2847
2848#ifdef CONFIG_XPS
2849        netif_reset_xps_queues(dev, offset, count);
2850#endif
2851        dev->tc_to_txq[tc].count = count;
2852        dev->tc_to_txq[tc].offset = offset;
2853        return 0;
2854}
2855EXPORT_SYMBOL(netdev_set_tc_queue);
2856
2857int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2858{
2859        if (num_tc > TC_MAX_QUEUE)
2860                return -EINVAL;
2861
2862#ifdef CONFIG_XPS
2863        netif_reset_xps_queues_gt(dev, 0);
2864#endif
2865        netdev_unbind_all_sb_channels(dev);
2866
2867        dev->num_tc = num_tc;
2868        return 0;
2869}
2870EXPORT_SYMBOL(netdev_set_num_tc);
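
/* Usage sketch of mqprio-style traffic class setup using the helpers above:
 * declare the TC count, give each TC a contiguous slice of Tx queues, then
 * map priorities onto TCs. All values are arbitrary examples.
 */
static int my_setup_tc(struct net_device *dev)
{
        int err, prio;

        err = netdev_set_num_tc(dev, 2);
        if (err)
                return err;

        netdev_set_tc_queue(dev, 0, 4, 0);      /* TC0 -> queues 0..3 */
        netdev_set_tc_queue(dev, 1, 4, 4);      /* TC1 -> queues 4..7 */

        for (prio = 0; prio < TC_BITMASK + 1; prio++)
                netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
        return 0;
}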
2871
2872void netdev_unbind_sb_channel(struct net_device *dev,
2873                              struct net_device *sb_dev)
2874{
2875        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2876
2877#ifdef CONFIG_XPS
2878        netif_reset_xps_queues_gt(sb_dev, 0);
2879#endif
2880        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2881        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2882
2883        while (txq-- != &dev->_tx[0]) {
2884                if (txq->sb_dev == sb_dev)
2885                        txq->sb_dev = NULL;
2886        }
2887}
2888EXPORT_SYMBOL(netdev_unbind_sb_channel);
2889
2890int netdev_bind_sb_channel_queue(struct net_device *dev,
2891                                 struct net_device *sb_dev,
2892                                 u8 tc, u16 count, u16 offset)
2893{
2894        /* Make certain the sb_dev and dev are already configured */
2895        if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2896                return -EINVAL;
2897
2898        /* We cannot hand out queues we don't have */
2899        if ((offset + count) > dev->real_num_tx_queues)
2900                return -EINVAL;
2901
2902        /* Record the mapping */
2903        sb_dev->tc_to_txq[tc].count = count;
2904        sb_dev->tc_to_txq[tc].offset = offset;
2905
2906        /* Provide a way for Tx queue to find the tc_to_txq map or
2907         * XPS map for itself.
2908         */
2909        while (count--)
2910                netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2911
2912        return 0;
2913}
2914EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2915
2916int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2917{
2918        /* Do not use a multiqueue device to represent a subordinate channel */
2919        if (netif_is_multiqueue(dev))
2920                return -ENODEV;
2921
2922        /* We allow channels 1 - 32767 to be used for subordinate channels.
2923         * Channel 0 is meant to be "native" mode and used only to represent
2924         * the main root device. We allow writing 0 to reset the device back
2925         * to normal mode after being used as a subordinate channel.
2926         */
2927        if (channel > S16_MAX)
2928                return -EINVAL;
2929
2930        dev->num_tc = -channel;
2931
2932        return 0;
2933}
2934EXPORT_SYMBOL(netdev_set_sb_channel);
2935
2936/*
2937 * Routine to help set real_num_tx_queues. When reducing the value, stale
2938 * skbs mapped to queues above the new limit must be flushed from the qdisc.
2939 */
2940int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2941{
2942        bool disabling;
2943        int rc;
2944
2945        disabling = txq < dev->real_num_tx_queues;
2946
2947        if (txq < 1 || txq > dev->num_tx_queues)
2948                return -EINVAL;
2949
2950        if (dev->reg_state == NETREG_REGISTERED ||
2951            dev->reg_state == NETREG_UNREGISTERING) {
2952                ASSERT_RTNL();
2953
2954                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2955                                                  txq);
2956                if (rc)
2957                        return rc;
2958
2959                if (dev->num_tc)
2960                        netif_setup_tc(dev, txq);
2961
2962                dev->real_num_tx_queues = txq;
2963
2964                if (disabling) {
2965                        synchronize_net();
2966                        qdisc_reset_all_tx_gt(dev, txq);
2967#ifdef CONFIG_XPS
2968                        netif_reset_xps_queues_gt(dev, txq);
2969#endif
2970                }
2971        } else {
2972                dev->real_num_tx_queues = txq;
2973        }
2974
2975        return 0;
2976}
2977EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2978
2979#ifdef CONFIG_SYSFS
2980/**
2981 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2982 *      @dev: Network device
2983 *      @rxq: Actual number of RX queues
2984 *
2985 *      This must be called either with the rtnl_lock held or before
2986 *      registration of the net device.  Returns 0 on success, or a
2987 *      negative error code.  If called before registration, it always
2988 *      succeeds.
2989 */
2990int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2991{
2992        int rc;
2993
2994        if (rxq < 1 || rxq > dev->num_rx_queues)
2995                return -EINVAL;
2996
2997        if (dev->reg_state == NETREG_REGISTERED) {
2998                ASSERT_RTNL();
2999
3000                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3001                                                  rxq);
3002                if (rc)
3003                        return rc;
3004        }
3005
3006        dev->real_num_rx_queues = rxq;
3007        return 0;
3008}
3009EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3010#endif
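
/* Usage sketch: after an ethtool channel-count change, a driver resizes its
 * active queue sets under RTNL; error handling is trimmed for brevity.
 */
static int my_set_channels(struct net_device *dev, unsigned int n)
{
        int err;

        ASSERT_RTNL();
        err = netif_set_real_num_tx_queues(dev, n);
        if (err)
                return err;
        return netif_set_real_num_rx_queues(dev, n);
}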
3011
3012/**
3013 * netif_get_num_default_rss_queues - default number of RSS queues
3014 *
3015 * This routine should set an upper limit on the number of RSS queues
3016 * used by default by multiqueue devices.
3017 */
3018int netif_get_num_default_rss_queues(void)
3019{
3020        return is_kdump_kernel() ?
3021                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
3022}
3023EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3024
3025static void __netif_reschedule(struct Qdisc *q)
3026{
3027        struct softnet_data *sd;
3028        unsigned long flags;
3029
3030        local_irq_save(flags);
3031        sd = this_cpu_ptr(&softnet_data);
3032        q->next_sched = NULL;
3033        *sd->output_queue_tailp = q;
3034        sd->output_queue_tailp = &q->next_sched;
3035        raise_softirq_irqoff(NET_TX_SOFTIRQ);
3036        local_irq_restore(flags);
3037}
3038
3039void __netif_schedule(struct Qdisc *q)
3040{
3041        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3042                __netif_reschedule(q);
3043}
3044EXPORT_SYMBOL(__netif_schedule);
3045
3046struct dev_kfree_skb_cb {
3047        enum skb_free_reason reason;
3048};
3049
3050static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3051{
3052        return (struct dev_kfree_skb_cb *)skb->cb;
3053}
3054
3055void netif_schedule_queue(struct netdev_queue *txq)
3056{
3057        rcu_read_lock();
3058        if (!netif_xmit_stopped(txq)) {
3059                struct Qdisc *q = rcu_dereference(txq->qdisc);
3060
3061                __netif_schedule(q);
3062        }
3063        rcu_read_unlock();
3064}
3065EXPORT_SYMBOL(netif_schedule_queue);
3066
3067void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3068{
3069        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3070                struct Qdisc *q;
3071
3072                rcu_read_lock();
3073                q = rcu_dereference(dev_queue->qdisc);
3074                __netif_schedule(q);
3075                rcu_read_unlock();
3076        }
3077}
3078EXPORT_SYMBOL(netif_tx_wake_queue);
3079
3080void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3081{
3082        unsigned long flags;
3083
3084        if (unlikely(!skb))
3085                return;
3086
3087        if (likely(refcount_read(&skb->users) == 1)) {
3088                smp_rmb();
3089                refcount_set(&skb->users, 0);
3090        } else if (likely(!refcount_dec_and_test(&skb->users))) {
3091                return;
3092        }
3093        get_kfree_skb_cb(skb)->reason = reason;
3094        local_irq_save(flags);
3095        skb->next = __this_cpu_read(softnet_data.completion_queue);
3096        __this_cpu_write(softnet_data.completion_queue, skb);
3097        raise_softirq_irqoff(NET_TX_SOFTIRQ);
3098        local_irq_restore(flags);
3099}
3100EXPORT_SYMBOL(__dev_kfree_skb_irq);
3101
3102void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3103{
3104        if (in_irq() || irqs_disabled())
3105                __dev_kfree_skb_irq(skb, reason);
3106        else
3107                dev_kfree_skb(skb);
3108}
3109EXPORT_SYMBOL(__dev_kfree_skb_any);
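
/* Usage sketch: a Tx completion handler that may run in hard-IRQ context
 * uses the _any wrappers (dev_consume_skb_any()/dev_kfree_skb_any()), which
 * fall back to the IRQ-safe completion queue above when needed.
 */
static void my_tx_complete(struct sk_buff *skb, bool transmitted)
{
        if (transmitted)
                dev_consume_skb_any(skb);       /* successful transmit, not a drop */
        else
                dev_kfree_skb_any(skb);         /* counted/traced as a drop */
}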
3110
3111
3112/**
3113 * netif_device_detach - mark device as removed
3114 * @dev: network device
3115 *
3116 * Mark device as removed from the system and therefore no longer available.
3117 */
3118void netif_device_detach(struct net_device *dev)
3119{
3120        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3121            netif_running(dev)) {
3122                netif_tx_stop_all_queues(dev);
3123        }
3124}
3125EXPORT_SYMBOL(netif_device_detach);
3126
3127/**
3128 * netif_device_attach - mark device as attached
3129 * @dev: network device
3130 *
3131 * Mark device as attached to the system and restart it if needed.
3132 */
3133void netif_device_attach(struct net_device *dev)
3134{
3135        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3136            netif_running(dev)) {
3137                netif_tx_wake_all_queues(dev);
3138                __netdev_watchdog_up(dev);
3139        }
3140}
3141EXPORT_SYMBOL(netif_device_attach);
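
/*
 * A minimal illustrative sketch (hypothetical driver code): suspend/resume
 * paths usually pair netif_device_detach() and netif_device_attach() around
 * the hardware-specific quiesce and re-init steps.
 */
static int example_suspend(struct net_device *dev)
{
        /* mark the device absent; stops all TX queues if it was running */
        netif_device_detach(dev);
        /* ... hardware-specific quiesce would go here ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... hardware-specific re-init would go here ... */
        /* mark the device present again; wakes TX queues and the watchdog */
        netif_device_attach(dev);
        return 0;
}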
3142
3143/*
3144 * Returns a Tx hash based on the given packet descriptor and the number of
3145 * Tx queues to be used as a distribution range.
3146 */
3147static u16 skb_tx_hash(const struct net_device *dev,
3148                       const struct net_device *sb_dev,
3149                       struct sk_buff *skb)
3150{
3151        u32 hash;
3152        u16 qoffset = 0;
3153        u16 qcount = dev->real_num_tx_queues;
3154
3155        if (dev->num_tc) {
3156                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3157
3158                qoffset = sb_dev->tc_to_txq[tc].offset;
3159                qcount = sb_dev->tc_to_txq[tc].count;
3160        }
3161
3162        if (skb_rx_queue_recorded(skb)) {
3163                hash = skb_get_rx_queue(skb);
3164                if (hash >= qoffset)
3165                        hash -= qoffset;
3166                while (unlikely(hash >= qcount))
3167                        hash -= qcount;
3168                return hash + qoffset;
3169        }
3170
3171        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3172}
3173
3174static void skb_warn_bad_offload(const struct sk_buff *skb)
3175{
3176        static const netdev_features_t null_features;
3177        struct net_device *dev = skb->dev;
3178        const char *name = "";
3179
3180        if (!net_ratelimit())
3181                return;
3182
3183        if (dev) {
3184                if (dev->dev.parent)
3185                        name = dev_driver_string(dev->dev.parent);
3186                else
3187                        name = netdev_name(dev);
3188        }
3189        skb_dump(KERN_WARNING, skb, false);
3190        WARN(1, "%s: caps=(%pNF, %pNF)\n",
3191             name, dev ? &dev->features : &null_features,
3192             skb->sk ? &skb->sk->sk_route_caps : &null_features);
3193}
3194
3195/*
3196 * Invalidate hardware checksum when packet is to be mangled, and
3197 * complete checksum manually on outgoing path.
3198 */
3199int skb_checksum_help(struct sk_buff *skb)
3200{
3201        __wsum csum;
3202        int ret = 0, offset;
3203
3204        if (skb->ip_summed == CHECKSUM_COMPLETE)
3205                goto out_set_summed;
3206
3207        if (unlikely(skb_shinfo(skb)->gso_size)) {
3208                skb_warn_bad_offload(skb);
3209                return -EINVAL;
3210        }
3211
3212        /* Before computing a checksum, we must make sure no frag could
3213         * be modified by an external entity: the checksum could otherwise be wrong.
3214         */
3215        if (skb_has_shared_frag(skb)) {
3216                ret = __skb_linearize(skb);
3217                if (ret)
3218                        goto out;
3219        }
3220
3221        offset = skb_checksum_start_offset(skb);
3222        BUG_ON(offset >= skb_headlen(skb));
3223        csum = skb_checksum(skb, offset, skb->len - offset, 0);
3224
3225        offset += skb->csum_offset;
3226        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3227
3228        ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3229        if (ret)
3230                goto out;
3231
3232        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3233out_set_summed:
3234        skb->ip_summed = CHECKSUM_NONE;
3235out:
3236        return ret;
3237}
3238EXPORT_SYMBOL(skb_checksum_help);
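
/*
 * A minimal illustrative sketch (hypothetical driver code): an ndo_start_xmit
 * implementation without checksum offload can complete a CHECKSUM_PARTIAL
 * packet in software with skb_checksum_help() before handing it to hardware.
 */
static netdev_tx_t example_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
                goto drop;

        /* ... queue the now fully checksummed skb on a hardware ring ... */
        return NETDEV_TX_OK;

drop:
        dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
}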
3239
3240int skb_crc32c_csum_help(struct sk_buff *skb)
3241{
3242        __le32 crc32c_csum;
3243        int ret = 0, offset, start;
3244
3245        if (skb->ip_summed != CHECKSUM_PARTIAL)
3246                goto out;
3247
3248        if (unlikely(skb_is_gso(skb)))
3249                goto out;
3250
3251        /* Before computing a checksum, we must make sure no frag could
3252         * be modified by an external entity: the checksum could otherwise be wrong.
3253         */
3254        if (unlikely(skb_has_shared_frag(skb))) {
3255                ret = __skb_linearize(skb);
3256                if (ret)
3257                        goto out;
3258        }
3259        start = skb_checksum_start_offset(skb);
3260        offset = start + offsetof(struct sctphdr, checksum);
3261        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3262                ret = -EINVAL;
3263                goto out;
3264        }
3265
3266        ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3267        if (ret)
3268                goto out;
3269
3270        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3271                                                  skb->len - start, ~(__u32)0,
3272                                                  crc32c_csum_stub));
3273        *(__le32 *)(skb->data + offset) = crc32c_csum;
3274        skb->ip_summed = CHECKSUM_NONE;
3275        skb->csum_not_inet = 0;
3276out:
3277        return ret;
3278}
3279
3280__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3281{
3282        __be16 type = skb->protocol;
3283
3284        /* Tunnel gso handlers can set protocol to ethernet. */
3285        if (type == htons(ETH_P_TEB)) {
3286                struct ethhdr *eth;
3287
3288                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3289                        return 0;
3290
3291                eth = (struct ethhdr *)skb->data;
3292                type = eth->h_proto;
3293        }
3294
3295        return __vlan_get_protocol(skb, type, depth);
3296}
3297
3298/**
3299 *      skb_mac_gso_segment - mac layer segmentation handler.
3300 *      @skb: buffer to segment
3301 *      @features: features for the output path (see dev->features)
3302 */
3303struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3304                                    netdev_features_t features)
3305{
3306        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3307        struct packet_offload *ptype;
3308        int vlan_depth = skb->mac_len;
3309        __be16 type = skb_network_protocol(skb, &vlan_depth);
3310
3311        if (unlikely(!type))
3312                return ERR_PTR(-EINVAL);
3313
3314        __skb_pull(skb, vlan_depth);
3315
3316        rcu_read_lock();
3317        list_for_each_entry_rcu(ptype, &offload_base, list) {
3318                if (ptype->type == type && ptype->callbacks.gso_segment) {
3319                        segs = ptype->callbacks.gso_segment(skb, features);
3320                        break;
3321                }
3322        }
3323        rcu_read_unlock();
3324
3325        __skb_push(skb, skb->data - skb_mac_header(skb));
3326
3327        return segs;
3328}
3329EXPORT_SYMBOL(skb_mac_gso_segment);
3330
3331
3332/* openvswitch calls this on rx path, so we need a different check.
3333 */
3334static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3335{
3336        if (tx_path)
3337                return skb->ip_summed != CHECKSUM_PARTIAL &&
3338                       skb->ip_summed != CHECKSUM_UNNECESSARY;
3339
3340        return skb->ip_summed == CHECKSUM_NONE;
3341}
3342
3343/**
3344 *      __skb_gso_segment - Perform segmentation on skb.
3345 *      @skb: buffer to segment
3346 *      @features: features for the output path (see dev->features)
3347 *      @tx_path: whether it is called in TX path
3348 *
3349 *      This function segments the given skb and returns a list of segments.
3350 *
3351 *      It may return NULL if the skb requires no segmentation.  This is
3352 *      only possible when GSO is used for verifying header integrity.
3353 *
3354 *      Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3355 */
3356struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3357                                  netdev_features_t features, bool tx_path)
3358{
3359        struct sk_buff *segs;
3360
3361        if (unlikely(skb_needs_check(skb, tx_path))) {
3362                int err;
3363
3364                /* We're going to init ->check field in TCP or UDP header */
3365                err = skb_cow_head(skb, 0);
3366                if (err < 0)
3367                        return ERR_PTR(err);
3368        }
3369
3370        /* Only report GSO partial support if it will enable us to
3371         * support segmentation on this frame without needing additional
3372         * work.
3373         */
3374        if (features & NETIF_F_GSO_PARTIAL) {
3375                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3376                struct net_device *dev = skb->dev;
3377
3378                partial_features |= dev->features & dev->gso_partial_features;
3379                if (!skb_gso_ok(skb, features | partial_features))
3380                        features &= ~NETIF_F_GSO_PARTIAL;
3381        }
3382
3383        BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3384                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3385
3386        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3387        SKB_GSO_CB(skb)->encap_level = 0;
3388
3389        skb_reset_mac_header(skb);
3390        skb_reset_mac_len(skb);
3391
3392        segs = skb_mac_gso_segment(skb, features);
3393
3394        if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3395                skb_warn_bad_offload(skb);
3396
3397        return segs;
3398}
3399EXPORT_SYMBOL(__skb_gso_segment);
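
/*
 * A minimal illustrative sketch (not an existing in-tree caller): users of
 * skb_gso_segment() replace the original skb with the returned list and then
 * handle each segment on its own. Here the segments are merely counted and
 * freed; a real caller would transmit them instead.
 */
static int example_segment_skb(struct sk_buff *skb, netdev_features_t features)
{
        struct sk_buff *segs, *seg, *next;
        int count = 0;

        segs = skb_gso_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return 1;       /* no segmentation was required */

        /* the original skb has been replaced by the segment list */
        consume_skb(skb);

        skb_list_walk_safe(segs, seg, next) {
                skb_mark_not_on_list(seg);
                kfree_skb(seg);
                count++;
        }
        return count;
}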
3400
3401/* Take action when hardware reception checksum errors are detected. */
3402#ifdef CONFIG_BUG
3403void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3404{
3405        if (net_ratelimit()) {
3406                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3407                skb_dump(KERN_ERR, skb, true);
3408                dump_stack();
3409        }
3410}
3411EXPORT_SYMBOL(netdev_rx_csum_fault);
3412#endif
3413
3414/* XXX: check that highmem exists at all on the given machine. */
3415static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3416{
3417#ifdef CONFIG_HIGHMEM
3418        int i;
3419
3420        if (!(dev->features & NETIF_F_HIGHDMA)) {
3421                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3422                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3423
3424                        if (PageHighMem(skb_frag_page(frag)))
3425                                return 1;
3426                }
3427        }
3428#endif
3429        return 0;
3430}
3431
3432/* If MPLS offload request, verify we are testing hardware MPLS features
3433 * instead of standard features for the netdev.
3434 */
3435#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3436static netdev_features_t net_mpls_features(struct sk_buff *skb,
3437                                           netdev_features_t features,
3438                                           __be16 type)
3439{
3440        if (eth_p_mpls(type))
3441                features &= skb->dev->mpls_features;
3442
3443        return features;
3444}
3445#else
3446static netdev_features_t net_mpls_features(struct sk_buff *skb,
3447                                           netdev_features_t features,
3448                                           __be16 type)
3449{
3450        return features;
3451}
3452#endif
3453
3454static netdev_features_t harmonize_features(struct sk_buff *skb,
3455        netdev_features_t features)
3456{
3457        __be16 type;
3458
3459        type = skb_network_protocol(skb, NULL);
3460        features = net_mpls_features(skb, features, type);
3461
3462        if (skb->ip_summed != CHECKSUM_NONE &&
3463            !can_checksum_protocol(features, type)) {
3464                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3465        }
3466        if (illegal_highdma(skb->dev, skb))
3467                features &= ~NETIF_F_SG;
3468
3469        return features;
3470}
3471
3472netdev_features_t passthru_features_check(struct sk_buff *skb,
3473                                          struct net_device *dev,
3474                                          netdev_features_t features)
3475{
3476        return features;
3477}
3478EXPORT_SYMBOL(passthru_features_check);
3479
3480static netdev_features_t dflt_features_check(struct sk_buff *skb,
3481                                             struct net_device *dev,
3482                                             netdev_features_t features)
3483{
3484        return vlan_features_check(skb, features);
3485}
3486
3487static netdev_features_t gso_features_check(const struct sk_buff *skb,
3488                                            struct net_device *dev,
3489                                            netdev_features_t features)
3490{
3491        u16 gso_segs = skb_shinfo(skb)->gso_segs;
3492
3493        if (gso_segs > dev->gso_max_segs)
3494                return features & ~NETIF_F_GSO_MASK;
3495
3496        /* Support for GSO partial features requires software
3497         * intervention before we can actually process the packets,
3498         * so strip support for any partial features now; we can
3499         * pull them back in after we have partially segmented
3500         * the frame.
3501         */
3502        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3503                features &= ~dev->gso_partial_features;
3504
3505        /* Make sure to clear the IPv4 ID mangling feature if the
3506         * IPv4 header has the potential to be fragmented.
3507         */
3508        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3509                struct iphdr *iph = skb->encapsulation ?
3510                                    inner_ip_hdr(skb) : ip_hdr(skb);
3511
3512                if (!(iph->frag_off & htons(IP_DF)))
3513                        features &= ~NETIF_F_TSO_MANGLEID;
3514        }
3515
3516        return features;
3517}
3518
3519netdev_features_t netif_skb_features(struct sk_buff *skb)
3520{
3521        struct net_device *dev = skb->dev;
3522        netdev_features_t features = dev->features;
3523
3524        if (skb_is_gso(skb))
3525                features = gso_features_check(skb, dev, features);
3526
3527        /* If encapsulation offload request, verify we are testing
3528         * hardware encapsulation features instead of standard
3529         * features for the netdev
3530         */
3531        if (skb->encapsulation)
3532                features &= dev->hw_enc_features;
3533
3534        if (skb_vlan_tagged(skb))
3535                features = netdev_intersect_features(features,
3536                                                     dev->vlan_features |
3537                                                     NETIF_F_HW_VLAN_CTAG_TX |
3538                                                     NETIF_F_HW_VLAN_STAG_TX);
3539
3540        if (dev->netdev_ops->ndo_features_check)
3541                features &= dev->netdev_ops->ndo_features_check(skb, dev,
3542                                                                features);
3543        else
3544                features &= dflt_features_check(skb, dev, features);
3545
3546        return harmonize_features(skb, features);
3547}
3548EXPORT_SYMBOL(netif_skb_features);
3549
3550static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3551                    struct netdev_queue *txq, bool more)
3552{
3553        unsigned int len;
3554        int rc;
3555
3556        if (dev_nit_active(dev))
3557                dev_queue_xmit_nit(skb, dev);
3558
3559        len = skb->len;
3560        trace_net_dev_start_xmit(skb, dev);
3561        rc = netdev_start_xmit(skb, dev, txq, more);
3562        trace_net_dev_xmit(skb, rc, dev, len);
3563
3564        return rc;
3565}
3566
3567struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3568                                    struct netdev_queue *txq, int *ret)
3569{
3570        struct sk_buff *skb = first;
3571        int rc = NETDEV_TX_OK;
3572
3573        while (skb) {
3574                struct sk_buff *next = skb->next;
3575
3576                skb_mark_not_on_list(skb);
3577                rc = xmit_one(skb, dev, txq, next != NULL);
3578                if (unlikely(!dev_xmit_complete(rc))) {
3579                        skb->next = next;
3580                        goto out;
3581                }
3582
3583                skb = next;
3584                if (netif_tx_queue_stopped(txq) && skb) {
3585                        rc = NETDEV_TX_BUSY;
3586                        break;
3587                }
3588        }
3589
3590out:
3591        *ret = rc;
3592        return skb;
3593}
3594
3595static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3596                                          netdev_features_t features)
3597{
3598        if (skb_vlan_tag_present(skb) &&
3599            !vlan_hw_offload_capable(features, skb->vlan_proto))
3600                skb = __vlan_hwaccel_push_inside(skb);
3601        return skb;
3602}
3603
3604int skb_csum_hwoffload_help(struct sk_buff *skb,
3605                            const netdev_features_t features)
3606{
3607        if (unlikely(skb->csum_not_inet))
3608                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3609                        skb_crc32c_csum_help(skb);
3610
3611        return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3612}
3613EXPORT_SYMBOL(skb_csum_hwoffload_help);
3614
3615static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3616{
3617        netdev_features_t features;
3618
3619        features = netif_skb_features(skb);
3620        skb = validate_xmit_vlan(skb, features);
3621        if (unlikely(!skb))
3622                goto out_null;
3623
3624        skb = sk_validate_xmit_skb(skb, dev);
3625        if (unlikely(!skb))
3626                goto out_null;
3627
3628        if (netif_needs_gso(skb, features)) {
3629                struct sk_buff *segs;
3630
3631                segs = skb_gso_segment(skb, features);
3632                if (IS_ERR(segs)) {
3633                        goto out_kfree_skb;
3634                } else if (segs) {
3635                        consume_skb(skb);
3636                        skb = segs;
3637                }
3638        } else {
3639                if (skb_needs_linearize(skb, features) &&
3640                    __skb_linearize(skb))
3641                        goto out_kfree_skb;
3642
3643                /* If packet is not checksummed and device does not
3644                 * support checksumming for this protocol, complete
3645                 * checksumming here.
3646                 */
3647                if (skb->ip_summed == CHECKSUM_PARTIAL) {
3648                        if (skb->encapsulation)
3649                                skb_set_inner_transport_header(skb,
3650                                                               skb_checksum_start_offset(skb));
3651                        else
3652                                skb_set_transport_header(skb,
3653                                                         skb_checksum_start_offset(skb));
3654                        if (skb_csum_hwoffload_help(skb, features))
3655                                goto out_kfree_skb;
3656                }
3657        }
3658
3659        skb = validate_xmit_xfrm(skb, features, again);
3660
3661        return skb;
3662
3663out_kfree_skb:
3664        kfree_skb(skb);
3665out_null:
3666        atomic_long_inc(&dev->tx_dropped);
3667        return NULL;
3668}
3669
3670struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3671{
3672        struct sk_buff *next, *head = NULL, *tail;
3673
3674        for (; skb != NULL; skb = next) {
3675                next = skb->next;
3676                skb_mark_not_on_list(skb);
3677
3678                /* in case the skb won't be segmented, point it to itself */
3679                skb->prev = skb;
3680
3681                skb = validate_xmit_skb(skb, dev, again);
3682                if (!skb)
3683                        continue;
3684
3685                if (!head)
3686                        head = skb;
3687                else
3688                        tail->next = skb;
3689                /* If skb was segmented, skb->prev points to
3690                 * the last segment. If not, it still contains skb.
3691                 */
3692                tail = skb->prev;
3693        }
3694        return head;
3695}
3696EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3697
3698static void qdisc_pkt_len_init(struct sk_buff *skb)
3699{
3700        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3701
3702        qdisc_skb_cb(skb)->pkt_len = skb->len;
3703
3704        /* To get a more precise estimate of the bytes sent on the wire,
3705         * we add the header size of every segment to pkt_len.
3706         */
3707        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3708                unsigned int hdr_len;
3709                u16 gso_segs = shinfo->gso_segs;
3710
3711                /* mac layer + network layer */
3712                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3713
3714                /* + transport layer */
3715                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3716                        const struct tcphdr *th;
3717                        struct tcphdr _tcphdr;
3718
3719                        th = skb_header_pointer(skb, skb_transport_offset(skb),
3720                                                sizeof(_tcphdr), &_tcphdr);
3721                        if (likely(th))
3722                                hdr_len += __tcp_hdrlen(th);
3723                } else {
3724                        struct udphdr _udphdr;
3725
3726                        if (skb_header_pointer(skb, skb_transport_offset(skb),
3727                                               sizeof(_udphdr), &_udphdr))
3728                                hdr_len += sizeof(struct udphdr);
3729                }
3730
3731                if (shinfo->gso_type & SKB_GSO_DODGY)
3732                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3733                                                shinfo->gso_size);
3734
3735                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3736        }
3737}
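
/*
 * Worked example for the adjustment above (illustrative numbers): a TCP skb
 * carrying 28960 bytes of payload with gso_size = 1448 and 66 bytes of
 * Ethernet + IPv4 + TCP headers has skb->len = 29026 but leaves the host as
 * 20 segments of 1514 bytes. qdisc_pkt_len_init() therefore adds
 * (20 - 1) * 66 = 1254 bytes, so pkt_len = 30280 matches the bytes actually
 * sent on the wire.
 */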
3738
3739static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3740                                 struct net_device *dev,
3741                                 struct netdev_queue *txq)
3742{
3743        spinlock_t *root_lock = qdisc_lock(q);
3744        struct sk_buff *to_free = NULL;
3745        bool contended;
3746        int rc;
3747
3748        qdisc_calculate_pkt_len(skb, q);
3749
3750        if (q->flags & TCQ_F_NOLOCK) {
3751                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3752                qdisc_run(q);
3753
3754                if (unlikely(to_free))
3755                        kfree_skb_list(to_free);
3756                return rc;
3757        }
3758
3759        /*
3760         * Heuristic to force contended enqueues to serialize on a
3761         * separate lock before trying to get the qdisc main lock.
3762         * This permits the qdisc->running owner to get the lock more
3763         * often and dequeue packets faster.
3764         */
3765        contended = qdisc_is_running(q);
3766        if (unlikely(contended))
3767                spin_lock(&q->busylock);
3768
3769        spin_lock(root_lock);
3770        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3771                __qdisc_drop(skb, &to_free);
3772                rc = NET_XMIT_DROP;
3773        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3774                   qdisc_run_begin(q)) {
3775                /*
3776                 * This is a work-conserving queue; there are no old skbs
3777                 * waiting to be sent out; and the qdisc is not running -
3778                 * xmit the skb directly.
3779                 */
3780
3781                qdisc_bstats_update(q, skb);
3782
3783                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3784                        if (unlikely(contended)) {
3785                                spin_unlock(&q->busylock);
3786                                contended = false;
3787                        }
3788                        __qdisc_run(q);
3789                }
3790
3791                qdisc_run_end(q);
3792                rc = NET_XMIT_SUCCESS;
3793        } else {
3794                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3795                if (qdisc_run_begin(q)) {
3796                        if (unlikely(contended)) {
3797                                spin_unlock(&q->busylock);
3798                                contended = false;
3799                        }
3800                        __qdisc_run(q);
3801                        qdisc_run_end(q);
3802                }
3803        }
3804        spin_unlock(root_lock);
3805        if (unlikely(to_free))
3806                kfree_skb_list(to_free);
3807        if (unlikely(contended))
3808                spin_unlock(&q->busylock);
3809        return rc;
3810}
3811
3812#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3813static void skb_update_prio(struct sk_buff *skb)
3814{
3815        const struct netprio_map *map;
3816        const struct sock *sk;
3817        unsigned int prioidx;
3818
3819        if (skb->priority)
3820                return;
3821        map = rcu_dereference_bh(skb->dev->priomap);
3822        if (!map)
3823                return;
3824        sk = skb_to_full_sk(skb);
3825        if (!sk)
3826                return;
3827
3828        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3829
3830        if (prioidx < map->priomap_len)
3831                skb->priority = map->priomap[prioidx];
3832}
3833#else
3834#define skb_update_prio(skb)
3835#endif
3836
3837/**
3838 *      dev_loopback_xmit - loop back @skb
3839 *      @net: network namespace this loopback is happening in
3840 *      @sk:  sk needed to be a netfilter okfn
3841 *      @skb: buffer to transmit
3842 */
3843int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3844{
3845        skb_reset_mac_header(skb);
3846        __skb_pull(skb, skb_network_offset(skb));
3847        skb->pkt_type = PACKET_LOOPBACK;
3848        skb->ip_summed = CHECKSUM_UNNECESSARY;
3849        WARN_ON(!skb_dst(skb));
3850        skb_dst_force(skb);
3851        netif_rx_ni(skb);
3852        return 0;
3853}
3854EXPORT_SYMBOL(dev_loopback_xmit);
3855
3856#ifdef CONFIG_NET_EGRESS
3857static struct sk_buff *
3858sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3859{
3860        struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3861        struct tcf_result cl_res;
3862
3863        if (!miniq)
3864                return skb;
3865
3866        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3867        mini_qdisc_bstats_cpu_update(miniq, skb);
3868
3869        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3870        case TC_ACT_OK:
3871        case TC_ACT_RECLASSIFY:
3872                skb->tc_index = TC_H_MIN(cl_res.classid);
3873                break;
3874        case TC_ACT_SHOT:
3875                mini_qdisc_qstats_cpu_drop(miniq);
3876                *ret = NET_XMIT_DROP;
3877                kfree_skb(skb);
3878                return NULL;
3879        case TC_ACT_STOLEN:
3880        case TC_ACT_QUEUED:
3881        case TC_ACT_TRAP:
3882                *ret = NET_XMIT_SUCCESS;
3883                consume_skb(skb);
3884                return NULL;
3885        case TC_ACT_REDIRECT:
3886                /* No need to push/pop skb's mac_header here on egress! */
3887                skb_do_redirect(skb);
3888                *ret = NET_XMIT_SUCCESS;
3889                return NULL;
3890        default:
3891                break;
3892        }
3893
3894        return skb;
3895}
3896#endif /* CONFIG_NET_EGRESS */
3897
3898#ifdef CONFIG_XPS
3899static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3900                               struct xps_dev_maps *dev_maps, unsigned int tci)
3901{
3902        struct xps_map *map;
3903        int queue_index = -1;
3904
3905        if (dev->num_tc) {
3906                tci *= dev->num_tc;
3907                tci += netdev_get_prio_tc_map(dev, skb->priority);
3908        }
3909
3910        map = rcu_dereference(dev_maps->attr_map[tci]);
3911        if (map) {
3912                if (map->len == 1)
3913                        queue_index = map->queues[0];
3914                else
3915                        queue_index = map->queues[reciprocal_scale(
3916                                                skb_get_hash(skb), map->len)];
3917                if (unlikely(queue_index >= dev->real_num_tx_queues))
3918                        queue_index = -1;
3919        }
3920        return queue_index;
3921}
3922#endif
3923
3924static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3925                         struct sk_buff *skb)
3926{
3927#ifdef CONFIG_XPS
3928        struct xps_dev_maps *dev_maps;
3929        struct sock *sk = skb->sk;
3930        int queue_index = -1;
3931
3932        if (!static_key_false(&xps_needed))
3933                return -1;
3934
3935        rcu_read_lock();
3936        if (!static_key_false(&xps_rxqs_needed))
3937                goto get_cpus_map;
3938
3939        dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3940        if (dev_maps) {
3941                int tci = sk_rx_queue_get(sk);
3942
3943                if (tci >= 0 && tci < dev->num_rx_queues)
3944                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3945                                                          tci);
3946        }
3947
3948get_cpus_map:
3949        if (queue_index < 0) {
3950                dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3951                if (dev_maps) {
3952                        unsigned int tci = skb->sender_cpu - 1;
3953
3954                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3955                                                          tci);
3956                }
3957        }
3958        rcu_read_unlock();
3959
3960        return queue_index;
3961#else
3962        return -1;
3963#endif
3964}
3965
3966u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3967                     struct net_device *sb_dev)
3968{
3969        return 0;
3970}
3971EXPORT_SYMBOL(dev_pick_tx_zero);
3972
3973u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3974                       struct net_device *sb_dev)
3975{
3976        return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3977}
3978EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3979
3980u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3981                     struct net_device *sb_dev)
3982{
3983        struct sock *sk = skb->sk;
3984        int queue_index = sk_tx_queue_get(sk);
3985
3986        sb_dev = sb_dev ? : dev;
3987
3988        if (queue_index < 0 || skb->ooo_okay ||
3989            queue_index >= dev->real_num_tx_queues) {
3990                int new_index = get_xps_queue(dev, sb_dev, skb);
3991
3992                if (new_index < 0)
3993                        new_index = skb_tx_hash(dev, sb_dev, skb);
3994
3995                if (queue_index != new_index && sk &&
3996                    sk_fullsock(sk) &&
3997                    rcu_access_pointer(sk->sk_dst_cache))
3998                        sk_tx_queue_set(sk, new_index);
3999
4000                queue_index = new_index;
4001        }
4002
4003        return queue_index;
4004}
4005EXPORT_SYMBOL(netdev_pick_tx);
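
/*
 * A minimal illustrative sketch (hypothetical driver code): a driver that
 * reserves queue 0 for a special traffic class can implement
 * .ndo_select_queue by handling its special case and delegating the rest to
 * netdev_pick_tx() above.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
                                struct net_device *sb_dev)
{
        /* hypothetical policy: LLDP frames always use queue 0 */
        if (skb->protocol == htons(ETH_P_LLDP))
                return 0;

        return netdev_pick_tx(dev, skb, sb_dev);
}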
4006
4007struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4008                                         struct sk_buff *skb,
4009                                         struct net_device *sb_dev)
4010{
4011        int queue_index = 0;
4012
4013#ifdef CONFIG_XPS
4014        u32 sender_cpu = skb->sender_cpu - 1;
4015
4016        if (sender_cpu >= (u32)NR_CPUS)
4017                skb->sender_cpu = raw_smp_processor_id() + 1;
4018#endif
4019
4020        if (dev->real_num_tx_queues != 1) {
4021                const struct net_device_ops *ops = dev->netdev_ops;
4022
4023                if (ops->ndo_select_queue)
4024                        queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4025                else
4026                        queue_index = netdev_pick_tx(dev, skb, sb_dev);
4027
4028                queue_index = netdev_cap_txqueue(dev, queue_index);
4029        }
4030
4031        skb_set_queue_mapping(skb, queue_index);
4032        return netdev_get_tx_queue(dev, queue_index);
4033}
4034
4035/**
4036 *      __dev_queue_xmit - transmit a buffer
4037 *      @skb: buffer to transmit
4038 *      @sb_dev: subordinate device used for L2 forwarding offload
4039 *
4040 *      Queue a buffer for transmission to a network device. The caller must
4041 *      have set the device and priority and built the buffer before calling
4042 *      this function. The function can be called from an interrupt.
4043 *
4044 *      A negative errno code is returned on a failure. A success does not
4045 *      guarantee the frame will be transmitted as it may be dropped due
4046 *      to congestion or traffic shaping.
4047 *
4048 * -----------------------------------------------------------------------------------
4049 *      I notice this method can also return errors from the queue disciplines,
4050 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
4051 *      be positive.
4052 *
4053 *      Regardless of the return value, the skb is consumed, so it is currently
4054 *      difficult to retry a send to this method.  (You can bump the ref count
4055 *      before sending to hold a reference for retry if you are careful.)
4056 *
4057 *      When calling this method, interrupts MUST be enabled.  This is because
4058 *      the BH enable code must have IRQs enabled so that it will not deadlock.
4059 *          --BLG
4060 */
4061static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4062{
4063        struct net_device *dev = skb->dev;
4064        struct netdev_queue *txq;
4065        struct Qdisc *q;
4066        int rc = -ENOMEM;
4067        bool again = false;
4068
4069        skb_reset_mac_header(skb);
4070
4071        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4072                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
4073
4074        /* Disable soft irqs for various locks below. Also
4075         * stops preemption for RCU.
4076         */
4077        rcu_read_lock_bh();
4078
4079        skb_update_prio(skb);
4080
4081        qdisc_pkt_len_init(skb);
4082#ifdef CONFIG_NET_CLS_ACT
4083        skb->tc_at_ingress = 0;
4084# ifdef CONFIG_NET_EGRESS
4085        if (static_branch_unlikely(&egress_needed_key)) {
4086                skb = sch_handle_egress(skb, &rc, dev);
4087                if (!skb)
4088                        goto out;
4089        }
4090# endif
4091#endif
4092        /* If device/qdisc don't need skb->dst, release it right now while
4093         * it's hot in this CPU's cache.
4094         */
4095        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4096                skb_dst_drop(skb);
4097        else
4098                skb_dst_force(skb);
4099
4100        txq = netdev_core_pick_tx(dev, skb, sb_dev);
4101        q = rcu_dereference_bh(txq->qdisc);
4102
4103        trace_net_dev_queue(skb);
4104        if (q->enqueue) {
4105                rc = __dev_xmit_skb(skb, q, dev, txq);
4106                goto out;
4107        }
4108
4109        /* The device has no queue. Common case for software devices:
4110         * loopback, all the sorts of tunnels...
4111         *
4112         * Really, it is unlikely that netif_tx_lock protection is necessary
4113         * here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
4114         * counters.)
4115         * However, it is possible that they rely on the protection
4116         * made by us here.
4117         *
4118         * Check this and shoot the lock. It is not prone to deadlocks.
4119         * Either shoot the noqueue qdisc, it is even simpler 8)
4120         */
4121        if (dev->flags & IFF_UP) {
4122                int cpu = smp_processor_id(); /* ok because BHs are off */
4123
4124                if (txq->xmit_lock_owner != cpu) {
4125                        if (dev_xmit_recursion())
4126                                goto recursion_alert;
4127
4128                        skb = validate_xmit_skb(skb, dev, &again);
4129                        if (!skb)
4130                                goto out;
4131
4132                        HARD_TX_LOCK(dev, txq, cpu);
4133
4134                        if (!netif_xmit_stopped(txq)) {
4135                                dev_xmit_recursion_inc();
4136                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4137                                dev_xmit_recursion_dec();
4138                                if (dev_xmit_complete(rc)) {
4139                                        HARD_TX_UNLOCK(dev, txq);
4140                                        goto out;
4141                                }
4142                        }
4143                        HARD_TX_UNLOCK(dev, txq);
4144                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4145                                             dev->name);
4146                } else {
4147                        /* Recursion is detected! It is possible,
4148                         * unfortunately
4149                         */
4150recursion_alert:
4151                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4152                                             dev->name);
4153                }
4154        }
4155
4156        rc = -ENETDOWN;
4157        rcu_read_unlock_bh();
4158
4159        atomic_long_inc(&dev->tx_dropped);
4160        kfree_skb_list(skb);
4161        return rc;
4162out:
4163        rcu_read_unlock_bh();
4164        return rc;
4165}
4166
4167int dev_queue_xmit(struct sk_buff *skb)
4168{
4169        return __dev_queue_xmit(skb, NULL);
4170}
4171EXPORT_SYMBOL(dev_queue_xmit);
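
/*
 * A minimal illustrative sketch (not an existing in-tree caller) of the
 * calling convention described in the comment above __dev_queue_xmit(): the
 * skb is consumed on both success and failure, and positive NET_XMIT_* codes
 * from the qdisc are mapped with the usual net_xmit_errno() idiom.
 */
static int example_send(struct sk_buff *skb, struct net_device *dev)
{
        int rc;

        skb->dev = dev;
        /* interrupts must be enabled here; BH-disabling locks are taken */
        rc = dev_queue_xmit(skb);
        if (rc > 0)             /* qdisc return codes are positive */
                rc = net_xmit_errno(rc);

        /* no kfree_skb() on error: the skb has already been consumed */
        return rc;
}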
4172
4173int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4174{
4175        return __dev_queue_xmit(skb, sb_dev);
4176}
4177EXPORT_SYMBOL(dev_queue_xmit_accel);
4178
4179int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4180{
4181        struct net_device *dev = skb->dev;
4182        struct sk_buff *orig_skb = skb;
4183        struct netdev_queue *txq;
4184        int ret = NETDEV_TX_BUSY;
4185        bool again = false;
4186
4187        if (unlikely(!netif_running(dev) ||
4188                     !netif_carrier_ok(dev)))
4189                goto drop;
4190
4191        skb = validate_xmit_skb_list(skb, dev, &again);
4192        if (skb != orig_skb)
4193                goto drop;
4194
4195        skb_set_queue_mapping(skb, queue_id);
4196        txq = skb_get_tx_queue(dev, skb);
4197
4198        local_bh_disable();
4199
4200        dev_xmit_recursion_inc();
4201        HARD_TX_LOCK(dev, txq, smp_processor_id());
4202        if (!netif_xmit_frozen_or_drv_stopped(txq))
4203                ret = netdev_start_xmit(skb, dev, txq, false);
4204        HARD_TX_UNLOCK(dev, txq);
4205        dev_xmit_recursion_dec();
4206
4207        local_bh_enable();
4208
4209        if (!dev_xmit_complete(ret))
4210                kfree_skb(skb);
4211
4212        return ret;
4213drop:
4214        atomic_long_inc(&dev->tx_dropped);
4215        kfree_skb_list(skb);
4216        return NET_XMIT_DROP;
4217}
4218EXPORT_SYMBOL(dev_direct_xmit);
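
/*
 * A minimal illustrative sketch (hypothetical caller): dev_direct_xmit()
 * bypasses the qdisc layer and transmits on an explicit queue, so the caller
 * must supply a valid queue_id and accept that the skb is consumed.
 */
static int example_direct_send(struct sk_buff *skb, struct net_device *dev,
                               u16 queue_id)
{
        skb->dev = dev;

        if (queue_id >= dev->real_num_tx_queues)
                return -EINVAL; /* caller still owns the skb in this case */

        /* consumes the skb; returns NET_XMIT_DROP or a NETDEV_TX_* code */
        return dev_direct_xmit(skb, queue_id);
}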
4219
4220/*************************************************************************
4221 *                      Receiver routines
4222 *************************************************************************/
4223
4224int netdev_max_backlog __read_mostly = 1000;
4225EXPORT_SYMBOL(netdev_max_backlog);
4226
4227int netdev_tstamp_prequeue __read_mostly = 1;
4228int netdev_budget __read_mostly = 300;
4229/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
4230unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4231int weight_p __read_mostly = 64;           /* old backlog weight */
4232int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4233int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4234int dev_rx_weight __read_mostly = 64;
4235int dev_tx_weight __read_mostly = 64;
4236/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4237int gro_normal_batch __read_mostly = 8;
4238
4239/* Called with irq disabled */
4240static inline void ____napi_schedule(struct softnet_data *sd,
4241                                     struct napi_struct *napi)
4242{
4243        list_add_tail(&napi->poll_list, &sd->poll_list);
4244        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4245}
4246
4247#ifdef CONFIG_RPS
4248
4249/* One global table that all flow-based protocols share. */
4250struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4251EXPORT_SYMBOL(rps_sock_flow_table);
4252u32 rps_cpu_mask __read_mostly;
4253EXPORT_SYMBOL(rps_cpu_mask);
4254
4255struct static_key_false rps_needed __read_mostly;
4256EXPORT_SYMBOL(rps_needed);
4257struct static_key_false rfs_needed __read_mostly;
4258EXPORT_SYMBOL(rfs_needed);
4259
4260static struct rps_dev_flow *
4261set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4262            struct rps_dev_flow *rflow, u16 next_cpu)
4263{
4264        if (next_cpu < nr_cpu_ids) {
4265#ifdef CONFIG_RFS_ACCEL
4266                struct netdev_rx_queue *rxqueue;
4267                struct rps_dev_flow_table *flow_table;
4268                struct rps_dev_flow *old_rflow;
4269                u32 flow_id;
4270                u16 rxq_index;
4271                int rc;
4272
4273                /* Should we steer this flow to a different hardware queue? */
4274                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4275                    !(dev->features & NETIF_F_NTUPLE))
4276                        goto out;
4277                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4278                if (rxq_index == skb_get_rx_queue(skb))
4279                        goto out;
4280
4281                rxqueue = dev->_rx + rxq_index;
4282                flow_table = rcu_dereference(rxqueue->rps_flow_table);
4283                if (!flow_table)
4284                        goto out;
4285                flow_id = skb_get_hash(skb) & flow_table->mask;
4286                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4287                                                        rxq_index, flow_id);
4288                if (rc < 0)
4289                        goto out;
4290                old_rflow = rflow;
4291                rflow = &flow_table->flows[flow_id];
4292                rflow->filter = rc;
4293                if (old_rflow->filter == rflow->filter)
4294                        old_rflow->filter = RPS_NO_FILTER;
4295        out:
4296#endif
4297                rflow->last_qtail =
4298                        per_cpu(softnet_data, next_cpu).input_queue_head;
4299        }
4300
4301        rflow->cpu = next_cpu;
4302        return rflow;
4303}
4304
4305/*
4306 * get_rps_cpu is called from netif_receive_skb and returns the target
4307 * CPU from the RPS map of the receiving queue for a given skb.
4308 * rcu_read_lock must be held on entry.
4309 */
4310static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4311                       struct rps_dev_flow **rflowp)
4312{
4313        const struct rps_sock_flow_table *sock_flow_table;
4314        struct netdev_rx_queue *rxqueue = dev->_rx;
4315        struct rps_dev_flow_table *flow_table;
4316        struct rps_map *map;
4317        int cpu = -1;
4318        u32 tcpu;
4319        u32 hash;
4320
4321        if (skb_rx_queue_recorded(skb)) {
4322                u16 index = skb_get_rx_queue(skb);
4323
4324                if (unlikely(index >= dev->real_num_rx_queues)) {
4325                        WARN_ONCE(dev->real_num_rx_queues > 1,
4326                                  "%s received packet on queue %u, but number "
4327                                  "of RX queues is %u\n",
4328                                  dev->name, index, dev->real_num_rx_queues);
4329                        goto done;
4330                }
4331                rxqueue += index;
4332        }
4333
4334        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4335
4336        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4337        map = rcu_dereference(rxqueue->rps_map);
4338        if (!flow_table && !map)
4339                goto done;
4340
4341        skb_reset_network_header(skb);
4342        hash = skb_get_hash(skb);
4343        if (!hash)
4344                goto done;
4345
4346        sock_flow_table = rcu_dereference(rps_sock_flow_table);
4347        if (flow_table && sock_flow_table) {
4348                struct rps_dev_flow *rflow;
4349                u32 next_cpu;
4350                u32 ident;
4351
4352                /* First check the global flow table for a match */
4353                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4354                if ((ident ^ hash) & ~rps_cpu_mask)
4355                        goto try_rps;
4356
4357                next_cpu = ident & rps_cpu_mask;
4358
4359                /* OK, now we know there is a match,
4360                 * we can look at the local (per receive queue) flow table
4361                 */
4362                rflow = &flow_table->flows[hash & flow_table->mask];
4363                tcpu = rflow->cpu;
4364
4365                /*
4366                 * If the desired CPU (where last recvmsg was done) is
4367                 * different from current CPU (one in the rx-queue flow
4368                 * table entry), switch if one of the following holds:
4369                 *   - Current CPU is unset (>= nr_cpu_ids).
4370                 *   - Current CPU is offline.
4371                 *   - The current CPU's queue tail has advanced beyond the
4372                 *     last packet that was enqueued using this table entry.
4373                 *     This guarantees that all previous packets for the flow
4374                 *     have been dequeued, thus preserving in order delivery.
4375                 */
4376                if (unlikely(tcpu != next_cpu) &&
4377                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4378                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4379                      rflow->last_qtail)) >= 0)) {
4380                        tcpu = next_cpu;
4381                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4382                }
4383
4384                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4385                        *rflowp = rflow;
4386                        cpu = tcpu;
4387                        goto done;
4388                }
4389        }
4390
4391try_rps:
4392
4393        if (map) {
4394                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4395                if (cpu_online(tcpu)) {
4396                        cpu = tcpu;
4397                        goto done;
4398                }
4399        }
4400
4401done:
4402        return cpu;
4403}
4404
4405#ifdef CONFIG_RFS_ACCEL
4406
4407/**
4408 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4409 * @dev: Device on which the filter was set
4410 * @rxq_index: RX queue index
4411 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4412 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4413 *
4414 * Drivers that implement ndo_rx_flow_steer() should periodically call
4415 * this function for each installed filter and remove the filters for
4416 * which it returns %true.
4417 */
4418bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4419                         u32 flow_id, u16 filter_id)
4420{
4421        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4422        struct rps_dev_flow_table *flow_table;
4423        struct rps_dev_flow *rflow;
4424        bool expire = true;
4425        unsigned int cpu;
4426
4427        rcu_read_lock();
4428        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4429        if (flow_table && flow_id <= flow_table->mask) {
4430                rflow = &flow_table->flows[flow_id];
4431                cpu = READ_ONCE(rflow->cpu);
4432                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4433                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4434                           rflow->last_qtail) <
4435                     (int)(10 * flow_table->mask)))
4436                        expire = false;
4437        }
4438        rcu_read_unlock();
4439        return expire;
4440}
4441EXPORT_SYMBOL(rps_may_expire_flow);
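
/*
 * A minimal illustrative sketch (hypothetical driver code) of the periodic
 * scan described in the comment above: a driver implementing
 * ndo_rx_flow_steer() walks its private filter table and retires entries for
 * which rps_may_expire_flow() returns true. The example_filter layout is an
 * assumption, not a kernel structure.
 */
struct example_filter {
        bool    in_use;
        u16     rxq_index;
        u32     flow_id;
        u16     filter_id;
};

static void example_expire_filters(struct net_device *dev,
                                   struct example_filter *tbl, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!tbl[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, tbl[i].rxq_index,
                                        tbl[i].flow_id, tbl[i].filter_id)) {
                        /* the hardware filter would also be removed here */
                        tbl[i].in_use = false;
                }
        }
}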
4442
4443#endif /* CONFIG_RFS_ACCEL */
4444
4445/* Called from hardirq (IPI) context */
4446static void rps_trigger_softirq(void *data)
4447{
4448        struct softnet_data *sd = data;
4449
4450        ____napi_schedule(sd, &sd->backlog);
4451        sd->received_rps++;
4452}
4453
4454#endif /* CONFIG_RPS */
4455
4456/*
4457 * Check if this softnet_data structure belongs to another CPU.
4458 * If yes, queue it to our IPI list and return 1;
4459 * if not, return 0.
4460 */
4461static int rps_ipi_queued(struct softnet_data *sd)
4462{
4463#ifdef CONFIG_RPS
4464        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4465
4466        if (sd != mysd) {
4467                sd->rps_ipi_next = mysd->rps_ipi_list;
4468                mysd->rps_ipi_list = sd;
4469
4470                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4471                return 1;
4472        }
4473#endif /* CONFIG_RPS */
4474        return 0;
4475}
4476
4477#ifdef CONFIG_NET_FLOW_LIMIT
4478int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4479#endif
4480
4481static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4482{
4483#ifdef CONFIG_NET_FLOW_LIMIT
4484        struct sd_flow_limit *fl;
4485        struct softnet_data *sd;
4486        unsigned int old_flow, new_flow;
4487
4488        if (qlen < (netdev_max_backlog >> 1))
4489                return false;
4490
4491        sd = this_cpu_ptr(&softnet_data);
4492
4493        rcu_read_lock();
4494        fl = rcu_dereference(sd->flow_limit);
4495        if (fl) {
4496                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4497                old_flow = fl->history[fl->history_head];
4498                fl->history[fl->history_head] = new_flow;
4499
4500                fl->history_head++;
4501                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4502
4503                if (likely(fl->buckets[old_flow]))
4504                        fl->buckets[old_flow]--;
4505
4506                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4507                        fl->count++;
4508                        rcu_read_unlock();
4509                        return true;
4510                }
4511        }
4512        rcu_read_unlock();
4513#endif
4514        return false;
4515}
4516
4517/*
4518 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4519 * queue (may be a remote CPU queue).
4520 */
4521static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4522                              unsigned int *qtail)
4523{
4524        struct softnet_data *sd;
4525        unsigned long flags;
4526        unsigned int qlen;
4527
4528        sd = &per_cpu(softnet_data, cpu);
4529
4530        local_irq_save(flags);
4531
4532        rps_lock(sd);
4533        if (!netif_running(skb->dev))
4534                goto drop;
4535        qlen = skb_queue_len(&sd->input_pkt_queue);
4536        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4537                if (qlen) {
4538enqueue:
4539                        __skb_queue_tail(&sd->input_pkt_queue, skb);
4540                        input_queue_tail_incr_save(sd, qtail);
4541                        rps_unlock(sd);
4542                        local_irq_restore(flags);
4543                        return NET_RX_SUCCESS;
4544                }
4545
4546                /* Schedule NAPI for the backlog device.
4547                 * We can use a non-atomic operation since we own the queue lock.
4548                 */
4549                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4550                        if (!rps_ipi_queued(sd))
4551                                ____napi_schedule(sd, &sd->backlog);
4552                }
4553                goto enqueue;
4554        }
4555
4556drop:
4557        sd->dropped++;
4558        rps_unlock(sd);
4559
4560        local_irq_restore(flags);
4561
4562        atomic_long_inc(&skb->dev->rx_dropped);
4563        kfree_skb(skb);
4564        return NET_RX_DROP;
4565}
4566
4567static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4568{
4569        struct net_device *dev = skb->dev;
4570        struct netdev_rx_queue *rxqueue;
4571
4572        rxqueue = dev->_rx;
4573
4574        if (skb_rx_queue_recorded(skb)) {
4575                u16 index = skb_get_rx_queue(skb);
4576
4577                if (unlikely(index >= dev->real_num_rx_queues)) {
4578                        WARN_ONCE(dev->real_num_rx_queues > 1,
4579                                  "%s received packet on queue %u, but number "
4580                                  "of RX queues is %u\n",
4581                                  dev->name, index, dev->real_num_rx_queues);
4582
4583                        return rxqueue; /* Return first rxqueue */
4584                }
4585                rxqueue += index;
4586        }
4587        return rxqueue;
4588}
4589
4590static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4591                                     struct xdp_buff *xdp,
4592                                     struct bpf_prog *xdp_prog)
4593{
4594        struct netdev_rx_queue *rxqueue;
4595        void *orig_data, *orig_data_end;
4596        u32 metalen, act = XDP_DROP;
4597        __be16 orig_eth_type;
4598        struct ethhdr *eth;
4599        bool orig_bcast;
4600        int hlen, off;
4601        u32 mac_len;
4602
4603        /* Reinjected packets coming from act_mirred or similar should
4604         * not get XDP generic processing.
4605         */
4606        if (skb_is_redirected(skb))
4607                return XDP_PASS;
4608
4609        /* XDP packets must be linear and must have sufficient headroom
4610         * of XDP_PACKET_HEADROOM bytes. Native XDP provides the same
4611         * guarantee, so we need to enforce it here as well.
4612         */
4613        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4614            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4615                int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4616                int troom = skb->tail + skb->data_len - skb->end;
4617
4618                /* In case we have to go down this path and also linearize,
4619                 * then let's do the pskb_expand_head() work just once here.
4620                 */
4621                if (pskb_expand_head(skb,
4622                                     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4623                                     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4624                        goto do_drop;
4625                if (skb_linearize(skb))
4626                        goto do_drop;
4627        }
4628
4629        /* The XDP program wants to see the packet starting at the MAC
4630         * header.
4631         */
4632        mac_len = skb->data - skb_mac_header(skb);
4633        hlen = skb_headlen(skb) + mac_len;
4634        xdp->data = skb->data - mac_len;
4635        xdp->data_meta = xdp->data;
4636        xdp->data_end = xdp->data + hlen;
4637        xdp->data_hard_start = skb->data - skb_headroom(skb);
4638
4639        /* SKB "head" area always has tailroom for skb_shared_info */
4640        xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
4641        xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4642
4643        orig_data_end = xdp->data_end;
4644        orig_data = xdp->data;
4645        eth = (struct ethhdr *)xdp->data;
4646        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4647        orig_eth_type = eth->h_proto;
4648
4649        rxqueue = netif_get_rxqueue(skb);
4650        xdp->rxq = &rxqueue->xdp_rxq;
4651
4652        act = bpf_prog_run_xdp(xdp_prog, xdp);
4653
4654        /* check if bpf_xdp_adjust_head was used */
4655        off = xdp->data - orig_data;
4656        if (off) {
4657                if (off > 0)
4658                        __skb_pull(skb, off);
4659                else if (off < 0)
4660                        __skb_push(skb, -off);
4661
4662                skb->mac_header += off;
4663                skb_reset_network_header(skb);
4664        }
4665
4666        /* check if bpf_xdp_adjust_tail was used */
4667        off = xdp->data_end - orig_data_end;
4668        if (off != 0) {
4669                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4670                skb->len += off; /* positive on grow, negative on shrink */
4671        }
4672
4673        /* check if XDP changed the eth hdr such that the SKB needs an update */
4674        eth = (struct ethhdr *)xdp->data;
4675        if ((orig_eth_type != eth->h_proto) ||
4676            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4677                __skb_push(skb, ETH_HLEN);
4678                skb->protocol = eth_type_trans(skb, skb->dev);
4679        }
4680
4681        switch (act) {
4682        case XDP_REDIRECT:
4683        case XDP_TX:
4684                __skb_push(skb, mac_len);
4685                break;
4686        case XDP_PASS:
4687                metalen = xdp->data - xdp->data_meta;
4688                if (metalen)
4689                        skb_metadata_set(skb, metalen);
4690                break;
4691        default:
4692                bpf_warn_invalid_xdp_action(act);
4693                fallthrough;
4694        case XDP_ABORTED:
4695                trace_xdp_exception(skb->dev, xdp_prog, act);
4696                fallthrough;
4697        case XDP_DROP:
4698        do_drop:
4699                kfree_skb(skb);
4700                break;
4701        }
4702
4703        return act;
4704}
4705
4706/* When doing generic XDP we have to bypass the qdisc layer and the
4707 * network taps in order to match in-driver-XDP behavior.
4708 */
4709void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4710{
4711        struct net_device *dev = skb->dev;
4712        struct netdev_queue *txq;
4713        bool free_skb = true;
4714        int cpu, rc;
4715
4716        txq = netdev_core_pick_tx(dev, skb, NULL);
4717        cpu = smp_processor_id();
4718        HARD_TX_LOCK(dev, txq, cpu);
4719        if (!netif_xmit_stopped(txq)) {
4720                rc = netdev_start_xmit(skb, dev, txq, 0);
4721                if (dev_xmit_complete(rc))
4722                        free_skb = false;
4723        }
4724        HARD_TX_UNLOCK(dev, txq);
4725        if (free_skb) {
4726                trace_xdp_exception(dev, xdp_prog, XDP_TX);
4727                kfree_skb(skb);
4728        }
4729}
4730
4731static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4732
4733int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4734{
4735        if (xdp_prog) {
4736                struct xdp_buff xdp;
4737                u32 act;
4738                int err;
4739
4740                act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4741                if (act != XDP_PASS) {
4742                        switch (act) {
4743                        case XDP_REDIRECT:
4744                                err = xdp_do_generic_redirect(skb->dev, skb,
4745                                                              &xdp, xdp_prog);
4746                                if (err)
4747                                        goto out_redir;
4748                                break;
4749                        case XDP_TX:
4750                                generic_xdp_tx(skb, xdp_prog);
4751                                break;
4752                        }
4753                        return XDP_DROP;
4754                }
4755        }
4756        return XDP_PASS;
4757out_redir:
4758        kfree_skb(skb);
4759        return XDP_DROP;
4760}
4761EXPORT_SYMBOL_GPL(do_xdp_generic);
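
/* Illustrative sketch (not taken from an in-tree driver): a receive path
 * without native XDP support could run the generic hook itself before
 * handing the skb to the stack. Assumes "dev" and "skb" are set up as in
 * the callers above:
 *
 *	rcu_read_lock();
 *	ret = do_xdp_generic(rcu_dereference(dev->xdp_prog), skb);
 *	rcu_read_unlock();
 *	if (ret != XDP_PASS)
 *		return;
 *
 * On anything other than XDP_PASS the skb has already been consumed
 * (dropped, transmitted or redirected), so the caller must not touch it.
 */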
4762
4763static int netif_rx_internal(struct sk_buff *skb)
4764{
4765        int ret;
4766
4767        net_timestamp_check(netdev_tstamp_prequeue, skb);
4768
4769        trace_netif_rx(skb);
4770
4771#ifdef CONFIG_RPS
4772        if (static_branch_unlikely(&rps_needed)) {
4773                struct rps_dev_flow voidflow, *rflow = &voidflow;
4774                int cpu;
4775
4776                preempt_disable();
4777                rcu_read_lock();
4778
4779                cpu = get_rps_cpu(skb->dev, skb, &rflow);
4780                if (cpu < 0)
4781                        cpu = smp_processor_id();
4782
4783                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4784
4785                rcu_read_unlock();
4786                preempt_enable();
4787        } else
4788#endif
4789        {
4790                unsigned int qtail;
4791
4792                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4793                put_cpu();
4794        }
4795        return ret;
4796}
4797
4798/**
4799 *      netif_rx        -       post buffer to the network code
4800 *      @skb: buffer to post
4801 *
4802 *      This function receives a packet from a device driver and queues it for
4803 *      the upper (protocol) levels to process.  It always succeeds. The buffer
4804 *      may be dropped during processing for congestion control or by the
4805 *      protocol layers.
4806 *
4807 *      return values:
4808 *      NET_RX_SUCCESS  (no congestion)
4809 *      NET_RX_DROP     (packet was dropped)
4810 *
4811 */
4812
4813int netif_rx(struct sk_buff *skb)
4814{
4815        int ret;
4816
4817        trace_netif_rx_entry(skb);
4818
4819        ret = netif_rx_internal(skb);
4820        trace_netif_rx_exit(ret);
4821
4822        return ret;
4823}
4824EXPORT_SYMBOL(netif_rx);
4825
4826int netif_rx_ni(struct sk_buff *skb)
4827{
4828        int err;
4829
4830        trace_netif_rx_ni_entry(skb);
4831
4832        preempt_disable();
4833        err = netif_rx_internal(skb);
4834        if (local_softirq_pending())
4835                do_softirq();
4836        preempt_enable();
4837        trace_netif_rx_ni_exit(err);
4838
4839        return err;
4840}
4841EXPORT_SYMBOL(netif_rx_ni);
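
/* Illustrative sketch (assumed driver code, not part of this file): the
 * classic non-NAPI receive path calls netif_rx() from the device's
 * interrupt handler, while process-context callers use netif_rx_ni() so
 * that pending softirqs get a chance to run:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	if (in_interrupt())
 *		netif_rx(skb);
 *	else
 *		netif_rx_ni(skb);
 */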
4842
4843static __latent_entropy void net_tx_action(struct softirq_action *h)
4844{
4845        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4846
4847        if (sd->completion_queue) {
4848                struct sk_buff *clist;
4849
4850                local_irq_disable();
4851                clist = sd->completion_queue;
4852                sd->completion_queue = NULL;
4853                local_irq_enable();
4854
4855                while (clist) {
4856                        struct sk_buff *skb = clist;
4857
4858                        clist = clist->next;
4859
4860                        WARN_ON(refcount_read(&skb->users));
4861                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4862                                trace_consume_skb(skb);
4863                        else
4864                                trace_kfree_skb(skb, net_tx_action);
4865
4866                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4867                                __kfree_skb(skb);
4868                        else
4869                                __kfree_skb_defer(skb);
4870                }
4871
4872                __kfree_skb_flush();
4873        }
4874
4875        if (sd->output_queue) {
4876                struct Qdisc *head;
4877
4878                local_irq_disable();
4879                head = sd->output_queue;
4880                sd->output_queue = NULL;
4881                sd->output_queue_tailp = &sd->output_queue;
4882                local_irq_enable();
4883
4884                while (head) {
4885                        struct Qdisc *q = head;
4886                        spinlock_t *root_lock = NULL;
4887
4888                        head = head->next_sched;
4889
4890                        if (!(q->flags & TCQ_F_NOLOCK)) {
4891                                root_lock = qdisc_lock(q);
4892                                spin_lock(root_lock);
4893                        }
4894                        /* We need to make sure head->next_sched is read
4895                         * before clearing __QDISC_STATE_SCHED
4896                         */
4897                        smp_mb__before_atomic();
4898                        clear_bit(__QDISC_STATE_SCHED, &q->state);
4899                        qdisc_run(q);
4900                        if (root_lock)
4901                                spin_unlock(root_lock);
4902                }
4903        }
4904
4905        xfrm_dev_backlog(sd);
4906}
4907
4908#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4909/* This hook is defined here for ATM LANE */
4910int (*br_fdb_test_addr_hook)(struct net_device *dev,
4911                             unsigned char *addr) __read_mostly;
4912EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4913#endif
4914
4915static inline struct sk_buff *
4916sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4917                   struct net_device *orig_dev)
4918{
4919#ifdef CONFIG_NET_CLS_ACT
4920        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4921        struct tcf_result cl_res;
4922
4923        /* If there's at least one ingress present somewhere (so
4924         * we get here via enabled static key), remaining devices
4925         * that are not configured with an ingress qdisc will bail
4926         * out here.
4927         */
4928        if (!miniq)
4929                return skb;
4930
4931        if (*pt_prev) {
4932                *ret = deliver_skb(skb, *pt_prev, orig_dev);
4933                *pt_prev = NULL;
4934        }
4935
4936        qdisc_skb_cb(skb)->pkt_len = skb->len;
4937        skb->tc_at_ingress = 1;
4938        mini_qdisc_bstats_cpu_update(miniq, skb);
4939
4940        switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
4941                                     &cl_res, false)) {
4942        case TC_ACT_OK:
4943        case TC_ACT_RECLASSIFY:
4944                skb->tc_index = TC_H_MIN(cl_res.classid);
4945                break;
4946        case TC_ACT_SHOT:
4947                mini_qdisc_qstats_cpu_drop(miniq);
4948                kfree_skb(skb);
4949                return NULL;
4950        case TC_ACT_STOLEN:
4951        case TC_ACT_QUEUED:
4952        case TC_ACT_TRAP:
4953                consume_skb(skb);
4954                return NULL;
4955        case TC_ACT_REDIRECT:
4956                /* skb_mac_header check was done by cls/act_bpf, so
4957                 * we can safely push the L2 header back before
4958                 * redirecting to another netdev
4959                 */
4960                __skb_push(skb, skb->mac_len);
4961                skb_do_redirect(skb);
4962                return NULL;
4963        case TC_ACT_CONSUMED:
4964                return NULL;
4965        default:
4966                break;
4967        }
4968#endif /* CONFIG_NET_CLS_ACT */
4969        return skb;
4970}
4971
4972/**
4973 *      netdev_is_rx_handler_busy - check if receive handler is registered
4974 *      @dev: device to check
4975 *
4976 *      Check if a receive handler is already registered for a given device.
4977 *      Return true if there is one.
4978 *
4979 *      The caller must hold the rtnl_mutex.
4980 */
4981bool netdev_is_rx_handler_busy(struct net_device *dev)
4982{
4983        ASSERT_RTNL();
4984        return dev && rtnl_dereference(dev->rx_handler);
4985}
4986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4987
4988/**
4989 *      netdev_rx_handler_register - register receive handler
4990 *      @dev: device to register a handler for
4991 *      @rx_handler: receive handler to register
4992 *      @rx_handler_data: data pointer that is used by rx handler
4993 *
4994 *      Register a receive handler for a device. This handler will then be
4995 *      called from __netif_receive_skb. A negative errno code is returned
4996 *      on a failure.
4997 *
4998 *      The caller must hold the rtnl_mutex.
4999 *
5000 *      For a general description of rx_handler, see enum rx_handler_result.
5001 */
5002int netdev_rx_handler_register(struct net_device *dev,
5003                               rx_handler_func_t *rx_handler,
5004                               void *rx_handler_data)
5005{
5006        if (netdev_is_rx_handler_busy(dev))
5007                return -EBUSY;
5008
5009        if (dev->priv_flags & IFF_NO_RX_HANDLER)
5010                return -EINVAL;
5011
5012        /* Note: rx_handler_data must be set before rx_handler */
5013        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5014        rcu_assign_pointer(dev->rx_handler, rx_handler);
5015
5016        return 0;
5017}
5018EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
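
/* Illustrative sketch (hypothetical names): a stacked device such as a
 * bridge or bonding port registers its handler under RTNL and retrieves
 * its private data via dev->rx_handler_data inside the handler:
 *
 *	static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct my_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		(consume, pass or re-inject *pskb here)
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	ASSERT_RTNL();
 *	err = netdev_rx_handler_register(port_dev, my_handle_frame, port);
 */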
5019
5020/**
5021 *      netdev_rx_handler_unregister - unregister receive handler
5022 *      @dev: device to unregister a handler from
5023 *
5024 *      Unregister a receive handler from a device.
5025 *
5026 *      The caller must hold the rtnl_mutex.
5027 */
5028void netdev_rx_handler_unregister(struct net_device *dev)
5029{
5030
5031        ASSERT_RTNL();
5032        RCU_INIT_POINTER(dev->rx_handler, NULL);
5033        /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
5034         * section is guaranteed to see a non-NULL rx_handler_data
5035         * as well.
5036         */
5037        synchronize_net();
5038        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5039}
5040EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5041
5042/*
5043 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5044 * the special handling of PFMEMALLOC skbs.
5045 */
5046static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5047{
5048        switch (skb->protocol) {
5049        case htons(ETH_P_ARP):
5050        case htons(ETH_P_IP):
5051        case htons(ETH_P_IPV6):
5052        case htons(ETH_P_8021Q):
5053        case htons(ETH_P_8021AD):
5054                return true;
5055        default:
5056                return false;
5057        }
5058}
5059
5060static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5061                             int *ret, struct net_device *orig_dev)
5062{
5063        if (nf_hook_ingress_active(skb)) {
5064                int ingress_retval;
5065
5066                if (*pt_prev) {
5067                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
5068                        *pt_prev = NULL;
5069                }
5070
5071                rcu_read_lock();
5072                ingress_retval = nf_hook_ingress(skb);
5073                rcu_read_unlock();
5074                return ingress_retval;
5075        }
5076        return 0;
5077}
5078
5079static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5080                                    struct packet_type **ppt_prev)
5081{
5082        struct packet_type *ptype, *pt_prev;
5083        rx_handler_func_t *rx_handler;
5084        struct sk_buff *skb = *pskb;
5085        struct net_device *orig_dev;
5086        bool deliver_exact = false;
5087        int ret = NET_RX_DROP;
5088        __be16 type;
5089
5090        net_timestamp_check(!netdev_tstamp_prequeue, skb);
5091
5092        trace_netif_receive_skb(skb);
5093
5094        orig_dev = skb->dev;
5095
5096        skb_reset_network_header(skb);
5097        if (!skb_transport_header_was_set(skb))
5098                skb_reset_transport_header(skb);
5099        skb_reset_mac_len(skb);
5100
5101        pt_prev = NULL;
5102
5103another_round:
5104        skb->skb_iif = skb->dev->ifindex;
5105
5106        __this_cpu_inc(softnet_data.processed);
5107
5108        if (static_branch_unlikely(&generic_xdp_needed_key)) {
5109                int ret2;
5110
5111                preempt_disable();
5112                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5113                preempt_enable();
5114
5115                if (ret2 != XDP_PASS) {
5116                        ret = NET_RX_DROP;
5117                        goto out;
5118                }
5119                skb_reset_mac_len(skb);
5120        }
5121
5122        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5123            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5124                skb = skb_vlan_untag(skb);
5125                if (unlikely(!skb))
5126                        goto out;
5127        }
5128
5129        if (skb_skip_tc_classify(skb))
5130                goto skip_classify;
5131
5132        if (pfmemalloc)
5133                goto skip_taps;
5134
5135        list_for_each_entry_rcu(ptype, &ptype_all, list) {
5136                if (pt_prev)
5137                        ret = deliver_skb(skb, pt_prev, orig_dev);
5138                pt_prev = ptype;
5139        }
5140
5141        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5142                if (pt_prev)
5143                        ret = deliver_skb(skb, pt_prev, orig_dev);
5144                pt_prev = ptype;
5145        }
5146
5147skip_taps:
5148#ifdef CONFIG_NET_INGRESS
5149        if (static_branch_unlikely(&ingress_needed_key)) {
5150                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5151                if (!skb)
5152                        goto out;
5153
5154                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5155                        goto out;
5156        }
5157#endif
5158        skb_reset_redirect(skb);
5159skip_classify:
5160        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5161                goto drop;
5162
5163        if (skb_vlan_tag_present(skb)) {
5164                if (pt_prev) {
5165                        ret = deliver_skb(skb, pt_prev, orig_dev);
5166                        pt_prev = NULL;
5167                }
5168                if (vlan_do_receive(&skb))
5169                        goto another_round;
5170                else if (unlikely(!skb))
5171                        goto out;
5172        }
5173
5174        rx_handler = rcu_dereference(skb->dev->rx_handler);
5175        if (rx_handler) {
5176                if (pt_prev) {
5177                        ret = deliver_skb(skb, pt_prev, orig_dev);
5178                        pt_prev = NULL;
5179                }
5180                switch (rx_handler(&skb)) {
5181                case RX_HANDLER_CONSUMED:
5182                        ret = NET_RX_SUCCESS;
5183                        goto out;
5184                case RX_HANDLER_ANOTHER:
5185                        goto another_round;
5186                case RX_HANDLER_EXACT:
5187                        deliver_exact = true;
                            fallthrough;
5188                case RX_HANDLER_PASS:
5189                        break;
5190                default:
5191                        BUG();
5192                }
5193        }
5194
5195        if (unlikely(skb_vlan_tag_present(skb))) {
5196check_vlan_id:
5197                if (skb_vlan_tag_get_id(skb)) {
5198                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
5199                         * find vlan device.
5200                         */
5201                        skb->pkt_type = PACKET_OTHERHOST;
5202                } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5203                           skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5204                        /* Outer header is 802.1P with vlan 0, inner header is
5205                         * 802.1Q or 802.1AD and vlan_do_receive() above could
5206                         * not find vlan dev for vlan id 0.
5207                         */
5208                        __vlan_hwaccel_clear_tag(skb);
5209                        skb = skb_vlan_untag(skb);
5210                        if (unlikely(!skb))
5211                                goto out;
5212                        if (vlan_do_receive(&skb))
5213                                /* After stripping off 802.1P header with vlan 0
5214                                 * vlan dev is found for inner header.
5215                                 */
5216                                goto another_round;
5217                        else if (unlikely(!skb))
5218                                goto out;
5219                        else
5220                                /* We have stripped outer 802.1P vlan 0 header.
5221                                 * But could not find vlan dev.
5222                                 * check again for vlan id to set OTHERHOST.
5223                                 */
5224                                goto check_vlan_id;
5225                }
5226                /* Note: we might in the future use prio bits
5227                 * and set skb->priority like in vlan_do_receive().
5228                 * For the time being, just ignore the Priority Code Point.
5229                 */
5230                __vlan_hwaccel_clear_tag(skb);
5231        }
5232
5233        type = skb->protocol;
5234
5235        /* deliver only exact match when indicated */
5236        if (likely(!deliver_exact)) {
5237                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5238                                       &ptype_base[ntohs(type) &
5239                                                   PTYPE_HASH_MASK]);
5240        }
5241
5242        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5243                               &orig_dev->ptype_specific);
5244
5245        if (unlikely(skb->dev != orig_dev)) {
5246                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5247                                       &skb->dev->ptype_specific);
5248        }
5249
5250        if (pt_prev) {
5251                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5252                        goto drop;
5253                *ppt_prev = pt_prev;
5254        } else {
5255drop:
5256                if (!deliver_exact)
5257                        atomic_long_inc(&skb->dev->rx_dropped);
5258                else
5259                        atomic_long_inc(&skb->dev->rx_nohandler);
5260                kfree_skb(skb);
5261                /* Jamal, now you will not be able to escape explaining
5262                 * to me how you were going to use this. :-)
5263                 */
5264                ret = NET_RX_DROP;
5265        }
5266
5267out:
5268        /* The invariant here is that if *ppt_prev is not NULL
5269         * then skb should also be non-NULL.
5270         *
5271         * The *ppt_prev assignment above upholds this invariant because
5272         * skb is dereferenced right before it.
5273         */
5274        *pskb = skb;
5275        return ret;
5276}
5277
5278static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5279{
5280        struct net_device *orig_dev = skb->dev;
5281        struct packet_type *pt_prev = NULL;
5282        int ret;
5283
5284        ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5285        if (pt_prev)
5286                ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5287                                         skb->dev, pt_prev, orig_dev);
5288        return ret;
5289}
5290
5291/**
5292 *      netif_receive_skb_core - special purpose version of netif_receive_skb
5293 *      @skb: buffer to process
5294 *
5295 *      More direct receive version of netif_receive_skb().  It should
5296 *      only be used by callers that have a need to skip RPS and Generic XDP.
5297 *      Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5298 *
5299 *      This function may only be called from softirq context and interrupts
5300 *      should be enabled.
5301 *
5302 *      Return values (usually ignored):
5303 *      NET_RX_SUCCESS: no congestion
5304 *      NET_RX_DROP: packet was dropped
5305 */
5306int netif_receive_skb_core(struct sk_buff *skb)
5307{
5308        int ret;
5309
5310        rcu_read_lock();
5311        ret = __netif_receive_skb_one_core(skb, false);
5312        rcu_read_unlock();
5313
5314        return ret;
5315}
5316EXPORT_SYMBOL(netif_receive_skb_core);
5317
5318static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5319                                                  struct packet_type *pt_prev,
5320                                                  struct net_device *orig_dev)
5321{
5322        struct sk_buff *skb, *next;
5323
5324        if (!pt_prev)
5325                return;
5326        if (list_empty(head))
5327                return;
5328        if (pt_prev->list_func != NULL)
5329                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5330                                   ip_list_rcv, head, pt_prev, orig_dev);
5331        else
5332                list_for_each_entry_safe(skb, next, head, list) {
5333                        skb_list_del_init(skb);
5334                        pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5335                }
5336}
5337
5338static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5339{
5340        /* Fast-path assumptions:
5341         * - There is no RX handler.
5342         * - Only one packet_type matches.
5343         * If either of these fails, we will end up doing some per-packet
5344         * processing in-line, then handling the 'last ptype' for the whole
5345         * sublist.  This can't cause out-of-order delivery to any single ptype,
5346         * because the 'last ptype' must be constant across the sublist, and all
5347         * other ptypes are handled per-packet.
5348         */
5349        /* Current (common) ptype of sublist */
5350        struct packet_type *pt_curr = NULL;
5351        /* Current (common) orig_dev of sublist */
5352        struct net_device *od_curr = NULL;
5353        struct list_head sublist;
5354        struct sk_buff *skb, *next;
5355
5356        INIT_LIST_HEAD(&sublist);
5357        list_for_each_entry_safe(skb, next, head, list) {
5358                struct net_device *orig_dev = skb->dev;
5359                struct packet_type *pt_prev = NULL;
5360
5361                skb_list_del_init(skb);
5362                __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5363                if (!pt_prev)
5364                        continue;
5365                if (pt_curr != pt_prev || od_curr != orig_dev) {
5366                        /* dispatch old sublist */
5367                        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5368                        /* start new sublist */
5369                        INIT_LIST_HEAD(&sublist);
5370                        pt_curr = pt_prev;
5371                        od_curr = orig_dev;
5372                }
5373                list_add_tail(&skb->list, &sublist);
5374        }
5375
5376        /* dispatch final sublist */
5377        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5378}
5379
5380static int __netif_receive_skb(struct sk_buff *skb)
5381{
5382        int ret;
5383
5384        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5385                unsigned int noreclaim_flag;
5386
5387                /*
5388                 * PFMEMALLOC skbs are special, they should
5389                 * - be delivered to SOCK_MEMALLOC sockets only
5390                 * - stay away from userspace
5391                 * - have bounded memory usage
5392                 *
5393                 * Use PF_MEMALLOC as this saves us from propagating the allocation
5394                 * context down to all allocation sites.
5395                 */
5396                noreclaim_flag = memalloc_noreclaim_save();
5397                ret = __netif_receive_skb_one_core(skb, true);
5398                memalloc_noreclaim_restore(noreclaim_flag);
5399        } else
5400                ret = __netif_receive_skb_one_core(skb, false);
5401
5402        return ret;
5403}
5404
5405static void __netif_receive_skb_list(struct list_head *head)
5406{
5407        unsigned long noreclaim_flag = 0;
5408        struct sk_buff *skb, *next;
5409        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5410
5411        list_for_each_entry_safe(skb, next, head, list) {
5412                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5413                        struct list_head sublist;
5414
5415                        /* Handle the previous sublist */
5416                        list_cut_before(&sublist, head, &skb->list);
5417                        if (!list_empty(&sublist))
5418                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
5419                        pfmemalloc = !pfmemalloc;
5420                        /* See comments in __netif_receive_skb */
5421                        if (pfmemalloc)
5422                                noreclaim_flag = memalloc_noreclaim_save();
5423                        else
5424                                memalloc_noreclaim_restore(noreclaim_flag);
5425                }
5426        }
5427        /* Handle the remaining sublist */
5428        if (!list_empty(head))
5429                __netif_receive_skb_list_core(head, pfmemalloc);
5430        /* Restore pflags */
5431        if (pfmemalloc)
5432                memalloc_noreclaim_restore(noreclaim_flag);
5433}
5434
5435static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5436{
5437        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5438        struct bpf_prog *new = xdp->prog;
5439        int ret = 0;
5440
5441        if (new) {
5442                u32 i;
5443
5444                /* generic XDP does not work with DEVMAPs that can
5445                 * have a bpf_prog installed on an entry
5446                 */
5447                for (i = 0; i < new->aux->used_map_cnt; i++) {
5448                        if (dev_map_can_have_prog(new->aux->used_maps[i]))
5449                                return -EINVAL;
5450                        if (cpu_map_prog_allowed(new->aux->used_maps[i]))
5451                                return -EINVAL;
5452                }
5453        }
5454
5455        switch (xdp->command) {
5456        case XDP_SETUP_PROG:
5457                rcu_assign_pointer(dev->xdp_prog, new);
5458                if (old)
5459                        bpf_prog_put(old);
5460
5461                if (old && !new) {
5462                        static_branch_dec(&generic_xdp_needed_key);
5463                } else if (new && !old) {
5464                        static_branch_inc(&generic_xdp_needed_key);
5465                        dev_disable_lro(dev);
5466                        dev_disable_gro_hw(dev);
5467                }
5468                break;
5469
5470        default:
5471                ret = -EINVAL;
5472                break;
5473        }
5474
5475        return ret;
5476}
5477
5478static int netif_receive_skb_internal(struct sk_buff *skb)
5479{
5480        int ret;
5481
5482        net_timestamp_check(netdev_tstamp_prequeue, skb);
5483
5484        if (skb_defer_rx_timestamp(skb))
5485                return NET_RX_SUCCESS;
5486
5487        rcu_read_lock();
5488#ifdef CONFIG_RPS
5489        if (static_branch_unlikely(&rps_needed)) {
5490                struct rps_dev_flow voidflow, *rflow = &voidflow;
5491                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5492
5493                if (cpu >= 0) {
5494                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5495                        rcu_read_unlock();
5496                        return ret;
5497                }
5498        }
5499#endif
5500        ret = __netif_receive_skb(skb);
5501        rcu_read_unlock();
5502        return ret;
5503}
5504
5505static void netif_receive_skb_list_internal(struct list_head *head)
5506{
5507        struct sk_buff *skb, *next;
5508        struct list_head sublist;
5509
5510        INIT_LIST_HEAD(&sublist);
5511        list_for_each_entry_safe(skb, next, head, list) {
5512                net_timestamp_check(netdev_tstamp_prequeue, skb);
5513                skb_list_del_init(skb);
5514                if (!skb_defer_rx_timestamp(skb))
5515                        list_add_tail(&skb->list, &sublist);
5516        }
5517        list_splice_init(&sublist, head);
5518
5519        rcu_read_lock();
5520#ifdef CONFIG_RPS
5521        if (static_branch_unlikely(&rps_needed)) {
5522                list_for_each_entry_safe(skb, next, head, list) {
5523                        struct rps_dev_flow voidflow, *rflow = &voidflow;
5524                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5525
5526                        if (cpu >= 0) {
5527                                /* Will be handled, remove from list */
5528                                skb_list_del_init(skb);
5529                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5530                        }
5531                }
5532        }
5533#endif
5534        __netif_receive_skb_list(head);
5535        rcu_read_unlock();
5536}
5537
5538/**
5539 *      netif_receive_skb - process receive buffer from network
5540 *      @skb: buffer to process
5541 *
5542 *      netif_receive_skb() is the main receive data processing function.
5543 *      It always succeeds. The buffer may be dropped during processing
5544 *      for congestion control or by the protocol layers.
5545 *
5546 *      This function may only be called from softirq context and interrupts
5547 *      should be enabled.
5548 *
5549 *      Return values (usually ignored):
5550 *      NET_RX_SUCCESS: no congestion
5551 *      NET_RX_DROP: packet was dropped
5552 */
5553int netif_receive_skb(struct sk_buff *skb)
5554{
5555        int ret;
5556
5557        trace_netif_receive_skb_entry(skb);
5558
5559        ret = netif_receive_skb_internal(skb);
5560        trace_netif_receive_skb_exit(ret);
5561
5562        return ret;
5563}
5564EXPORT_SYMBOL(netif_receive_skb);
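
/* Illustrative sketch (assumed driver code): a NAPI poll routine may hand
 * completed buffers to the stack one at a time with netif_receive_skb();
 * my_get_completed_rx() is a placeholder for the driver's ring handling:
 *
 *	while (work_done < budget &&
 *	       (skb = my_get_completed_rx(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, ring->netdev);
 *		netif_receive_skb(skb);
 *		work_done++;
 *	}
 */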
5565
5566/**
5567 *      netif_receive_skb_list - process many receive buffers from network
5568 *      @head: list of skbs to process.
5569 *
5570 *      Since return value of netif_receive_skb() is normally ignored, and
5571 *      wouldn't be meaningful for a list, this function returns void.
5572 *
5573 *      This function may only be called from softirq context and interrupts
5574 *      should be enabled.
5575 */
5576void netif_receive_skb_list(struct list_head *head)
5577{
5578        struct sk_buff *skb;
5579
5580        if (list_empty(head))
5581                return;
5582        if (trace_netif_receive_skb_list_entry_enabled()) {
5583                list_for_each_entry(skb, head, list)
5584                        trace_netif_receive_skb_list_entry(skb);
5585        }
5586        netif_receive_skb_list_internal(head);
5587        trace_netif_receive_skb_list_exit(0);
5588}
5589EXPORT_SYMBOL(netif_receive_skb_list);
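
/* Illustrative sketch (assumed driver code): batching several received
 * skbs on a list and handing them over in one call amortizes per-packet
 * costs in the functions above:
 *
 *	LIST_HEAD(rx_list);
 *
 *	while ((skb = my_get_completed_rx(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, ring->netdev);
 *		list_add_tail(&skb->list, &rx_list);
 *	}
 *	netif_receive_skb_list(&rx_list);
 */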
5590
5591static DEFINE_PER_CPU(struct work_struct, flush_works);
5592
5593/* Network device is going away, flush any packets still pending */
5594static void flush_backlog(struct work_struct *work)
5595{
5596        struct sk_buff *skb, *tmp;
5597        struct softnet_data *sd;
5598
5599        local_bh_disable();
5600        sd = this_cpu_ptr(&softnet_data);
5601
5602        local_irq_disable();
5603        rps_lock(sd);
5604        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5605                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5606                        __skb_unlink(skb, &sd->input_pkt_queue);
5607                        dev_kfree_skb_irq(skb);
5608                        input_queue_head_incr(sd);
5609                }
5610        }
5611        rps_unlock(sd);
5612        local_irq_enable();
5613
5614        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5615                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5616                        __skb_unlink(skb, &sd->process_queue);
5617                        kfree_skb(skb);
5618                        input_queue_head_incr(sd);
5619                }
5620        }
5621        local_bh_enable();
5622}
5623
5624static void flush_all_backlogs(void)
5625{
5626        unsigned int cpu;
5627
5628        get_online_cpus();
5629
5630        for_each_online_cpu(cpu)
5631                queue_work_on(cpu, system_highpri_wq,
5632                              per_cpu_ptr(&flush_works, cpu));
5633
5634        for_each_online_cpu(cpu)
5635                flush_work(per_cpu_ptr(&flush_works, cpu));
5636
5637        put_online_cpus();
5638}
5639
5640/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5641static void gro_normal_list(struct napi_struct *napi)
5642{
5643        if (!napi->rx_count)
5644                return;
5645        netif_receive_skb_list_internal(&napi->rx_list);
5646        INIT_LIST_HEAD(&napi->rx_list);
5647        napi->rx_count = 0;
5648}
5649
5650/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5651 * pass the whole batch up to the stack.
5652 */
5653static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
5654{
5655        list_add_tail(&skb->list, &napi->rx_list);
5656        if (++napi->rx_count >= gro_normal_batch)
5657                gro_normal_list(napi);
5658}
5659
5660INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5661INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5662static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5663{
5664        struct packet_offload *ptype;
5665        __be16 type = skb->protocol;
5666        struct list_head *head = &offload_base;
5667        int err = -ENOENT;
5668
5669        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5670
5671        if (NAPI_GRO_CB(skb)->count == 1) {
5672                skb_shinfo(skb)->gso_size = 0;
5673                goto out;
5674        }
5675
5676        rcu_read_lock();
5677        list_for_each_entry_rcu(ptype, head, list) {
5678                if (ptype->type != type || !ptype->callbacks.gro_complete)
5679                        continue;
5680
5681                err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5682                                         ipv6_gro_complete, inet_gro_complete,
5683                                         skb, 0);
5684                break;
5685        }
5686        rcu_read_unlock();
5687
5688        if (err) {
5689                WARN_ON(&ptype->list == head);
5690                kfree_skb(skb);
5691                return NET_RX_SUCCESS;
5692        }
5693
5694out:
5695        gro_normal_one(napi, skb);
5696        return NET_RX_SUCCESS;
5697}
5698
5699static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5700                                   bool flush_old)
5701{
5702        struct list_head *head = &napi->gro_hash[index].list;
5703        struct sk_buff *skb, *p;
5704
5705        list_for_each_entry_safe_reverse(skb, p, head, list) {
5706                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5707                        return;
5708                skb_list_del_init(skb);
5709                napi_gro_complete(napi, skb);
5710                napi->gro_hash[index].count--;
5711        }
5712
5713        if (!napi->gro_hash[index].count)
5714                __clear_bit(index, &napi->gro_bitmask);
5715}
5716
5717/* napi->gro_hash[].list contains packets ordered by age.
5718 * The youngest packets are at its head.
5719 * Complete skbs in reverse order to reduce latencies.
5720 */
5721void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5722{
5723        unsigned long bitmask = napi->gro_bitmask;
5724        unsigned int i, base = ~0U;
5725
5726        while ((i = ffs(bitmask)) != 0) {
5727                bitmask >>= i;
5728                base += i;
5729                __napi_gro_flush_chain(napi, base, flush_old);
5730        }
5731}
5732EXPORT_SYMBOL(napi_gro_flush);
5733
5734static struct list_head *gro_list_prepare(struct napi_struct *napi,
5735                                          struct sk_buff *skb)
5736{
5737        unsigned int maclen = skb->dev->hard_header_len;
5738        u32 hash = skb_get_hash_raw(skb);
5739        struct list_head *head;
5740        struct sk_buff *p;
5741
5742        head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5743        list_for_each_entry(p, head, list) {
5744                unsigned long diffs;
5745
5746                NAPI_GRO_CB(p)->flush = 0;
5747
5748                if (hash != skb_get_hash_raw(p)) {
5749                        NAPI_GRO_CB(p)->same_flow = 0;
5750                        continue;
5751                }
5752
5753                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5754                diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5755                if (skb_vlan_tag_present(p))
5756                        diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5757                diffs |= skb_metadata_dst_cmp(p, skb);
5758                diffs |= skb_metadata_differs(p, skb);
5759                if (maclen == ETH_HLEN)
5760                        diffs |= compare_ether_header(skb_mac_header(p),
5761                                                      skb_mac_header(skb));
5762                else if (!diffs)
5763                        diffs = memcmp(skb_mac_header(p),
5764                                       skb_mac_header(skb),
5765                                       maclen);
5766                NAPI_GRO_CB(p)->same_flow = !diffs;
5767        }
5768
5769        return head;
5770}
5771
5772static void skb_gro_reset_offset(struct sk_buff *skb)
5773{
5774        const struct skb_shared_info *pinfo = skb_shinfo(skb);
5775        const skb_frag_t *frag0 = &pinfo->frags[0];
5776
5777        NAPI_GRO_CB(skb)->data_offset = 0;
5778        NAPI_GRO_CB(skb)->frag0 = NULL;
5779        NAPI_GRO_CB(skb)->frag0_len = 0;
5780
5781        if (!skb_headlen(skb) && pinfo->nr_frags &&
5782            !PageHighMem(skb_frag_page(frag0))) {
5783                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5784                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5785                                                    skb_frag_size(frag0),
5786                                                    skb->end - skb->tail);
5787        }
5788}
5789
5790static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5791{
5792        struct skb_shared_info *pinfo = skb_shinfo(skb);
5793
5794        BUG_ON(skb->end - skb->tail < grow);
5795
5796        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5797
5798        skb->data_len -= grow;
5799        skb->tail += grow;
5800
5801        skb_frag_off_add(&pinfo->frags[0], grow);
5802        skb_frag_size_sub(&pinfo->frags[0], grow);
5803
5804        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5805                skb_frag_unref(skb, 0);
5806                memmove(pinfo->frags, pinfo->frags + 1,
5807                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5808        }
5809}
5810
5811static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5812{
5813        struct sk_buff *oldest;
5814
5815        oldest = list_last_entry(head, struct sk_buff, list);
5816
5817        /* We are called with head length >= MAX_GRO_SKBS, so this is
5818         * impossible.
5819         */
5820        if (WARN_ON_ONCE(!oldest))
5821                return;
5822
5823        /* Do not adjust napi->gro_hash[].count, caller is adding a new
5824         * SKB to the chain.
5825         */
5826        skb_list_del_init(oldest);
5827        napi_gro_complete(napi, oldest);
5828}
5829
5830INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5831                                                           struct sk_buff *));
5832INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5833                                                           struct sk_buff *));
5834static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5835{
5836        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5837        struct list_head *head = &offload_base;
5838        struct packet_offload *ptype;
5839        __be16 type = skb->protocol;
5840        struct list_head *gro_head;
5841        struct sk_buff *pp = NULL;
5842        enum gro_result ret;
5843        int same_flow;
5844        int grow;
5845
5846        if (netif_elide_gro(skb->dev))
5847                goto normal;
5848
5849        gro_head = gro_list_prepare(napi, skb);
5850
5851        rcu_read_lock();
5852        list_for_each_entry_rcu(ptype, head, list) {
5853                if (ptype->type != type || !ptype->callbacks.gro_receive)
5854                        continue;
5855
5856                skb_set_network_header(skb, skb_gro_offset(skb));
5857                skb_reset_mac_len(skb);
5858                NAPI_GRO_CB(skb)->same_flow = 0;
5859                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5860                NAPI_GRO_CB(skb)->free = 0;
5861                NAPI_GRO_CB(skb)->encap_mark = 0;
5862                NAPI_GRO_CB(skb)->recursion_counter = 0;
5863                NAPI_GRO_CB(skb)->is_fou = 0;
5864                NAPI_GRO_CB(skb)->is_atomic = 1;
5865                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5866
5867                /* Setup for GRO checksum validation */
5868                switch (skb->ip_summed) {
5869                case CHECKSUM_COMPLETE:
5870                        NAPI_GRO_CB(skb)->csum = skb->csum;
5871                        NAPI_GRO_CB(skb)->csum_valid = 1;
5872                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5873                        break;
5874                case CHECKSUM_UNNECESSARY:
5875                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5876                        NAPI_GRO_CB(skb)->csum_valid = 0;
5877                        break;
5878                default:
5879                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5880                        NAPI_GRO_CB(skb)->csum_valid = 0;
5881                }
5882
5883                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5884                                        ipv6_gro_receive, inet_gro_receive,
5885                                        gro_head, skb);
5886                break;
5887        }
5888        rcu_read_unlock();
5889
5890        if (&ptype->list == head)
5891                goto normal;
5892
5893        if (PTR_ERR(pp) == -EINPROGRESS) {
5894                ret = GRO_CONSUMED;
5895                goto ok;
5896        }
5897
5898        same_flow = NAPI_GRO_CB(skb)->same_flow;
5899        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5900
5901        if (pp) {
5902                skb_list_del_init(pp);
5903                napi_gro_complete(napi, pp);
5904                napi->gro_hash[hash].count--;
5905        }
5906
5907        if (same_flow)
5908                goto ok;
5909
5910        if (NAPI_GRO_CB(skb)->flush)
5911                goto normal;
5912
5913        if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5914                gro_flush_oldest(napi, gro_head);
5915        } else {
5916                napi->gro_hash[hash].count++;
5917        }
5918        NAPI_GRO_CB(skb)->count = 1;
5919        NAPI_GRO_CB(skb)->age = jiffies;
5920        NAPI_GRO_CB(skb)->last = skb;
5921        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5922        list_add(&skb->list, gro_head);
5923        ret = GRO_HELD;
5924
5925pull:
5926        grow = skb_gro_offset(skb) - skb_headlen(skb);
5927        if (grow > 0)
5928                gro_pull_from_frag0(skb, grow);
5929ok:
5930        if (napi->gro_hash[hash].count) {
5931                if (!test_bit(hash, &napi->gro_bitmask))
5932                        __set_bit(hash, &napi->gro_bitmask);
5933        } else if (test_bit(hash, &napi->gro_bitmask)) {
5934                __clear_bit(hash, &napi->gro_bitmask);
5935        }
5936
5937        return ret;
5938
5939normal:
5940        ret = GRO_NORMAL;
5941        goto pull;
5942}
5943
5944struct packet_offload *gro_find_receive_by_type(__be16 type)
5945{
5946        struct list_head *offload_head = &offload_base;
5947        struct packet_offload *ptype;
5948
5949        list_for_each_entry_rcu(ptype, offload_head, list) {
5950                if (ptype->type != type || !ptype->callbacks.gro_receive)
5951                        continue;
5952                return ptype;
5953        }
5954        return NULL;
5955}
5956EXPORT_SYMBOL(gro_find_receive_by_type);
5957
5958struct packet_offload *gro_find_complete_by_type(__be16 type)
5959{
5960        struct list_head *offload_head = &offload_base;
5961        struct packet_offload *ptype;
5962
5963        list_for_each_entry_rcu(ptype, offload_head, list) {
5964                if (ptype->type != type || !ptype->callbacks.gro_complete)
5965                        continue;
5966                return ptype;
5967        }
5968        return NULL;
5969}
5970EXPORT_SYMBOL(gro_find_complete_by_type);
5971
5972static void napi_skb_free_stolen_head(struct sk_buff *skb)
5973{
5974        skb_dst_drop(skb);
5975        skb_ext_put(skb);
5976        kmem_cache_free(skbuff_head_cache, skb);
5977}
5978
5979static gro_result_t napi_skb_finish(struct napi_struct *napi,
5980                                    struct sk_buff *skb,
5981                                    gro_result_t ret)
5982{
5983        switch (ret) {
5984        case GRO_NORMAL:
5985                gro_normal_one(napi, skb);
5986                break;
5987
5988        case GRO_DROP:
5989                kfree_skb(skb);
5990                break;
5991
5992        case GRO_MERGED_FREE:
5993                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5994                        napi_skb_free_stolen_head(skb);
5995                else
5996                        __kfree_skb(skb);
5997                break;
5998
5999        case GRO_HELD:
6000        case GRO_MERGED:
6001        case GRO_CONSUMED:
6002                break;
6003        }
6004
6005        return ret;
6006}
6007
6008gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
6009{
6010        gro_result_t ret;
6011
6012        skb_mark_napi_id(skb, napi);
6013        trace_napi_gro_receive_entry(skb);
6014
6015        skb_gro_reset_offset(skb);
6016
6017        ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6018        trace_napi_gro_receive_exit(ret);
6019
6020        return ret;
6021}
6022EXPORT_SYMBOL(napi_gro_receive);
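
/* Illustrative sketch (assumed driver code): most NAPI drivers feed
 * received skbs through GRO rather than calling netif_receive_skb()
 * directly, so that e.g. TCP segments of the same flow can be merged:
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	skb_record_rx_queue(skb, ring->queue_index);
 *	napi_gro_receive(&ring->napi, skb);
 */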
6023
6024static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
6025{
6026        if (unlikely(skb->pfmemalloc)) {
6027                consume_skb(skb);
6028                return;
6029        }
6030        __skb_pull(skb, skb_headlen(skb));
6031        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
6032        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
6033        __vlan_hwaccel_clear_tag(skb);
6034        skb->dev = napi->dev;
6035        skb->skb_iif = 0;
6036
6037        /* eth_type_trans() assumes pkt_type is PACKET_HOST */
6038        skb->pkt_type = PACKET_HOST;
6039
6040        skb->encapsulation = 0;
6041        skb_shinfo(skb)->gso_type = 0;
6042        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6043        skb_ext_reset(skb);
6044
6045        napi->skb = skb;
6046}
6047
6048struct sk_buff *napi_get_frags(struct napi_struct *napi)
6049{
6050        struct sk_buff *skb = napi->skb;
6051
6052        if (!skb) {
6053                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
6054                if (skb) {
6055                        napi->skb = skb;
6056                        skb_mark_napi_id(skb, napi);
6057                }
6058        }
6059        return skb;
6060}
6061EXPORT_SYMBOL(napi_get_frags);
6062
6063static gro_result_t napi_frags_finish(struct napi_struct *napi,
6064                                      struct sk_buff *skb,
6065                                      gro_result_t ret)
6066{
6067        switch (ret) {
6068        case GRO_NORMAL:
6069        case GRO_HELD:
6070                __skb_push(skb, ETH_HLEN);
6071                skb->protocol = eth_type_trans(skb, skb->dev);
6072                if (ret == GRO_NORMAL)
6073                        gro_normal_one(napi, skb);
6074                break;
6075
6076        case GRO_DROP:
6077                napi_reuse_skb(napi, skb);
6078                break;
6079
6080        case GRO_MERGED_FREE:
6081                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6082                        napi_skb_free_stolen_head(skb);
6083                else
6084                        napi_reuse_skb(napi, skb);
6085                break;
6086
6087        case GRO_MERGED:
6088        case GRO_CONSUMED:
6089                break;
6090        }
6091
6092        return ret;
6093}
6094
6095/* The upper GRO stack assumes the network header starts at gro_offset=0.
6096 * Drivers may call both napi_gro_frags() and napi_gro_receive(), so we
6097 * copy the ethernet header into skb->data to have a common layout.
6098 */
6099static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6100{
6101        struct sk_buff *skb = napi->skb;
6102        const struct ethhdr *eth;
6103        unsigned int hlen = sizeof(*eth);
6104
6105        napi->skb = NULL;
6106
6107        skb_reset_mac_header(skb);
6108        skb_gro_reset_offset(skb);
6109
6110        if (unlikely(skb_gro_header_hard(skb, hlen))) {
6111                eth = skb_gro_header_slow(skb, hlen, 0);
6112                if (unlikely(!eth)) {
6113                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6114                                             __func__, napi->dev->name);
6115                        napi_reuse_skb(napi, skb);
6116                        return NULL;
6117                }
6118        } else {
6119                eth = (const struct ethhdr *)skb->data;
6120                gro_pull_from_frag0(skb, hlen);
6121                NAPI_GRO_CB(skb)->frag0 += hlen;
6122                NAPI_GRO_CB(skb)->frag0_len -= hlen;
6123        }
6124        __skb_pull(skb, hlen);
6125
6126        /*
6127         * This works because the only protocols we care about don't require
6128         * special handling.
6129         * We'll fix it up properly in napi_frags_finish()
6130         */
6131        skb->protocol = eth->h_proto;
6132
6133        return skb;
6134}
6135
6136gro_result_t napi_gro_frags(struct napi_struct *napi)
6137{
6138        gro_result_t ret;
6139        struct sk_buff *skb = napi_frags_skb(napi);
6140
6141        if (!skb)
6142                return GRO_DROP;
6143
6144        trace_napi_gro_frags_entry(skb);
6145
6146        ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6147        trace_napi_gro_frags_exit(ret);
6148
6149        return ret;
6150}
6151EXPORT_SYMBOL(napi_gro_frags);
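
/* Illustrative sketch (assumed driver code): the frags API lets a driver
 * build the skb from page fragments without touching the headers itself;
 * napi_frags_skb() above then pulls the ethernet header out of frag0:
 *
 *	skb = napi_get_frags(&rx->napi);
 *	if (unlikely(!skb))
 *		goto drop;
 *	skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 *	napi_gro_frags(&rx->napi);
 */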
6152
6153/* Compute the checksum from gro_offset and return the folded value
6154 * after adding in any pseudo checksum.
6155 */
6156__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6157{
6158        __wsum wsum;
6159        __sum16 sum;
6160
6161        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6162
6163        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6164        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6165        /* See comments in __skb_checksum_complete(). */
6166        if (likely(!sum)) {
6167                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6168                    !skb->csum_complete_sw)
6169                        netdev_rx_csum_fault(skb->dev, skb);
6170        }
6171
6172        NAPI_GRO_CB(skb)->csum = wsum;
6173        NAPI_GRO_CB(skb)->csum_valid = 1;
6174
6175        return sum;
6176}
6177EXPORT_SYMBOL(__skb_gro_checksum_complete);
6178
6179static void net_rps_send_ipi(struct softnet_data *remsd)
6180{
6181#ifdef CONFIG_RPS
6182        while (remsd) {
6183                struct softnet_data *next = remsd->rps_ipi_next;
6184
6185                if (cpu_online(remsd->cpu))
6186                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
6187                remsd = next;
6188        }
6189#endif
6190}
6191
6192/*
6193 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
6194 * Note: called with local irq disabled, but exits with local irq enabled.
6195 */
6196static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6197{
6198#ifdef CONFIG_RPS
6199        struct softnet_data *remsd = sd->rps_ipi_list;
6200
6201        if (remsd) {
6202                sd->rps_ipi_list = NULL;
6203
6204                local_irq_enable();
6205
6206                /* Send pending IPIs to kick RPS processing on remote CPUs. */
6207                net_rps_send_ipi(remsd);
6208        } else
6209#endif
6210                local_irq_enable();
6211}
6212
6213static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6214{
6215#ifdef CONFIG_RPS
6216        return sd->rps_ipi_list != NULL;
6217#else
6218        return false;
6219#endif
6220}
6221
6222static int process_backlog(struct napi_struct *napi, int quota)
6223{
6224        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6225        bool again = true;
6226        int work = 0;
6227
6228        /* Check if we have pending IPIs; it is better to send them now
6229         * rather than waiting for net_rx_action() to end.
6230         */
6231        if (sd_has_rps_ipi_waiting(sd)) {
6232                local_irq_disable();
6233                net_rps_action_and_irq_enable(sd);
6234        }
6235
6236        napi->weight = dev_rx_weight;
6237        while (again) {
6238                struct sk_buff *skb;
6239
6240                while ((skb = __skb_dequeue(&sd->process_queue))) {
6241                        rcu_read_lock();
6242                        __netif_receive_skb(skb);
6243                        rcu_read_unlock();
6244                        input_queue_head_incr(sd);
6245                        if (++work >= quota)
6246                                return work;
6247
6248                }
6249
6250                local_irq_disable();
6251                rps_lock(sd);
6252                if (skb_queue_empty(&sd->input_pkt_queue)) {
6253                        /*
6254                         * Inline a custom version of __napi_complete().
6255                         * Only the current CPU owns and manipulates this NAPI,
6256                         * and NAPI_STATE_SCHED is the only possible flag set
6257                         * on backlog.
6258                         * We can use a plain write instead of clear_bit(),
6259                         * and we don't need an smp_mb() memory barrier.
6260                         */
6261                        napi->state = 0;
6262                        again = false;
6263                } else {
6264                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
6265                                                   &sd->process_queue);
6266                }
6267                rps_unlock(sd);
6268                local_irq_enable();
6269        }
6270
6271        return work;
6272}
6273
6274/**
6275 * __napi_schedule - schedule for receive
6276 * @n: entry to schedule
6277 *
6278 * The entry's receive function will be scheduled to run.
6279 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6280 */
6281void __napi_schedule(struct napi_struct *n)
6282{
6283        unsigned long flags;
6284
6285        local_irq_save(flags);
6286        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6287        local_irq_restore(flags);
6288}
6289EXPORT_SYMBOL(__napi_schedule);
6290
6291/**
6292 *      napi_schedule_prep - check if napi can be scheduled
6293 *      @n: napi context
6294 *
6295 * Test if NAPI routine is already running, and if not mark
6296 * it as running.  This is used as a condition variable to
6297 * ensure that only one NAPI poll instance runs.  We also make
6298 * sure there is no pending NAPI disable.
6299 */
6300bool napi_schedule_prep(struct napi_struct *n)
6301{
6302        unsigned long val, new;
6303
6304        do {
6305                val = READ_ONCE(n->state);
6306                if (unlikely(val & NAPIF_STATE_DISABLE))
6307                        return false;
6308                new = val | NAPIF_STATE_SCHED;
6309
6310                /* Set the STATE_MISSED bit if STATE_SCHED was already set.
6311                 * This was suggested by Alexander Duyck, as the compiler
6312                 * emits better code than:
6313                 * if (val & NAPIF_STATE_SCHED)
6314                 *     new |= NAPIF_STATE_MISSED;
6315                 */
6316                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6317                                                   NAPIF_STATE_MISSED;
6318        } while (cmpxchg(&n->state, val, new) != val);
6319
6320        return !(val & NAPIF_STATE_SCHED);
6321}
6322EXPORT_SYMBOL(napi_schedule_prep);
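/* Usage sketch (hypothetical driver): napi_schedule_prep() pairs with
 * __napi_schedule_irqoff() in a hard interrupt handler; my_intr(), my_priv
 * and my_disable_rx_irq() are invented names. Most drivers use the
 * napi_schedule() wrapper, which combines the two calls.
 *
 *	static irqreturn_t my_intr(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_disable_rx_irq(priv);
 *			__napi_schedule_irqoff(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */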
6323
6324/**
6325 * __napi_schedule_irqoff - schedule for receive
6326 * @n: entry to schedule
6327 *
6328 * Variant of __napi_schedule() assuming hard irqs are masked
6329 */
6330void __napi_schedule_irqoff(struct napi_struct *n)
6331{
6332        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6333}
6334EXPORT_SYMBOL(__napi_schedule_irqoff);
6335
6336bool napi_complete_done(struct napi_struct *n, int work_done)
6337{
6338        unsigned long flags, val, new, timeout = 0;
6339        bool ret = true;
6340
6341        /*
6342         * 1) Don't let this NAPI be dequeued from the CPU poll list,
6343         *    just in case it is running on a different CPU.
6344         * 2) If we are busy polling, do nothing here; we have
6345         *    the guarantee that we will be called later.
6346         */
6347        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6348                                 NAPIF_STATE_IN_BUSY_POLL)))
6349                return false;
6350
6351        if (work_done) {
6352                if (n->gro_bitmask)
6353                        timeout = READ_ONCE(n->dev->gro_flush_timeout);
6354                n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6355        }
6356        if (n->defer_hard_irqs_count > 0) {
6357                n->defer_hard_irqs_count--;
6358                timeout = READ_ONCE(n->dev->gro_flush_timeout);
6359                if (timeout)
6360                        ret = false;
6361        }
6362        if (n->gro_bitmask) {
6363                /* When the NAPI instance uses a timeout and keeps postponing
6364                 * it, we need to somehow bound the time packets are kept in
6365                 * the GRO layer.
6366                 */
6367                napi_gro_flush(n, !!timeout);
6368        }
6369
6370        gro_normal_list(n);
6371
6372        if (unlikely(!list_empty(&n->poll_list))) {
6373                /* If n->poll_list is not empty, we need to mask irqs */
6374                local_irq_save(flags);
6375                list_del_init(&n->poll_list);
6376                local_irq_restore(flags);
6377        }
6378
6379        do {
6380                val = READ_ONCE(n->state);
6381
6382                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6383
6384                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6385
6386                /* If STATE_MISSED was set, leave STATE_SCHED set,
6387                 * because we will call napi->poll() one more time.
6388                 * This C code was suggested by Alexander Duyck to help gcc.
6389                 */
6390                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6391                                                    NAPIF_STATE_SCHED;
6392        } while (cmpxchg(&n->state, val, new) != val);
6393
6394        if (unlikely(val & NAPIF_STATE_MISSED)) {
6395                __napi_schedule(n);
6396                return false;
6397        }
6398
6399        if (timeout)
6400                hrtimer_start(&n->timer, ns_to_ktime(timeout),
6401                              HRTIMER_MODE_REL_PINNED);
6402        return ret;
6403}
6404EXPORT_SYMBOL(napi_complete_done);
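/* Usage sketch (hypothetical driver): the canonical poll() loop that pairs
 * with napi_complete_done(); my_poll(), my_clean_rx() and my_enable_rx_irq()
 * are invented names.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work_done = my_clean_rx(napi, budget);
 *
 *		if (work_done < budget &&
 *		    napi_complete_done(napi, work_done))
 *			my_enable_rx_irq(napi);
 *		return work_done;
 *	}
 *
 * Returning the full budget keeps the instance scheduled; re-enabling device
 * interrupts only when napi_complete_done() returns true respects the
 * gro_flush_timeout / napi_defer_hard_irqs deferral handled above.
 */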
6405
6406/* Must be called under rcu_read_lock(), as we don't take a reference. */
6407static struct napi_struct *napi_by_id(unsigned int napi_id)
6408{
6409        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6410        struct napi_struct *napi;
6411
6412        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6413                if (napi->napi_id == napi_id)
6414                        return napi;
6415
6416        return NULL;
6417}
6418
6419#if defined(CONFIG_NET_RX_BUSY_POLL)
6420
6421#define BUSY_POLL_BUDGET 8
6422
6423static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6424{
6425        int rc;
6426
6427        /* Busy polling means there is a high chance device driver hard irq
6428         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6429         * set in napi_schedule_prep().
6430         * Since we are about to call napi->poll() once more, we can safely
6431         * clear NAPI_STATE_MISSED.
6432         *
6433         * Note: x86 could use a single "lock and ..." instruction
6434         * to perform these two clear_bit() operations.
6435         */
6436        clear_bit(NAPI_STATE_MISSED, &napi->state);
6437        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6438
6439        local_bh_disable();
6440
6441        /* All we really want here is to re-enable device interrupts.
6442         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6443         */
6444        rc = napi->poll(napi, BUSY_POLL_BUDGET);
6445        /* We can't gro_normal_list() here, because napi->poll() might have
6446         * rearmed the napi (napi_complete_done()) in which case it could
6447         * already be running on another CPU.
6448         */
6449        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6450        netpoll_poll_unlock(have_poll_lock);
6451        if (rc == BUSY_POLL_BUDGET) {
6452                /* As the whole budget was spent, we still own the NAPI, so we can
6453                 * safely handle the rx_list.
6454                 */
6455                gro_normal_list(napi);
6456                __napi_schedule(napi);
6457        }
6458        local_bh_enable();
6459}
6460
6461void napi_busy_loop(unsigned int napi_id,
6462                    bool (*loop_end)(void *, unsigned long),
6463                    void *loop_end_arg)
6464{
6465        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6466        int (*napi_poll)(struct napi_struct *napi, int budget);
6467        void *have_poll_lock = NULL;
6468        struct napi_struct *napi;
6469
6470restart:
6471        napi_poll = NULL;
6472
6473        rcu_read_lock();
6474
6475        napi = napi_by_id(napi_id);
6476        if (!napi)
6477                goto out;
6478
6479        preempt_disable();
6480        for (;;) {
6481                int work = 0;
6482
6483                local_bh_disable();
6484                if (!napi_poll) {
6485                        unsigned long val = READ_ONCE(napi->state);
6486
6487                        /* If multiple threads are competing for this napi,
6488                         * we avoid dirtying napi->state as much as we can.
6489                         */
6490                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6491                                   NAPIF_STATE_IN_BUSY_POLL))
6492                                goto count;
6493                        if (cmpxchg(&napi->state, val,
6494                                    val | NAPIF_STATE_IN_BUSY_POLL |
6495                                          NAPIF_STATE_SCHED) != val)
6496                                goto count;
6497                        have_poll_lock = netpoll_poll_lock(napi);
6498                        napi_poll = napi->poll;
6499                }
6500                work = napi_poll(napi, BUSY_POLL_BUDGET);
6501                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6502                gro_normal_list(napi);
6503count:
6504                if (work > 0)
6505                        __NET_ADD_STATS(dev_net(napi->dev),
6506                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
6507                local_bh_enable();
6508
6509                if (!loop_end || loop_end(loop_end_arg, start_time))
6510                        break;
6511
6512                if (unlikely(need_resched())) {
6513                        if (napi_poll)
6514                                busy_poll_stop(napi, have_poll_lock);
6515                        preempt_enable();
6516                        rcu_read_unlock();
6517                        cond_resched();
6518                        if (loop_end(loop_end_arg, start_time))
6519                                return;
6520                        goto restart;
6521                }
6522                cpu_relax();
6523        }
6524        if (napi_poll)
6525                busy_poll_stop(napi, have_poll_lock);
6526        preempt_enable();
6527out:
6528        rcu_read_unlock();
6529}
6530EXPORT_SYMBOL(napi_busy_loop);
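/* Usage sketch (hypothetical caller): napi_busy_loop() keeps polling until
 * the loop_end callback returns true. A time-bounded predicate could look
 * like the one below; my_loop_end() and my_poll_usecs are invented names,
 * and the in-tree socket busy-poll path uses its own, similar predicate.
 *
 *	static bool my_loop_end(void *arg, unsigned long start_time)
 *	{
 *		unsigned long my_poll_usecs = *(unsigned long *)arg;
 *
 *		return time_after(busy_loop_current_time(),
 *				  start_time + my_poll_usecs) ||
 *		       need_resched();
 *	}
 *
 *	napi_busy_loop(napi_id, my_loop_end, &my_poll_usecs);
 */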
6531
6532#endif /* CONFIG_NET_RX_BUSY_POLL */
6533
6534static void napi_hash_add(struct napi_struct *napi)
6535{
6536        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6537            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6538                return;
6539
6540        spin_lock(&napi_hash_lock);
6541
6542        /* 0..NR_CPUS range is reserved for sender_cpu use */
6543        do {
6544                if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6545                        napi_gen_id = MIN_NAPI_ID;
6546        } while (napi_by_id(napi_gen_id));
6547        napi->napi_id = napi_gen_id;
6548
6549        hlist_add_head_rcu(&napi->napi_hash_node,
6550                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6551
6552        spin_unlock(&napi_hash_lock);
6553}
6554
6555/* Warning: the caller is responsible for making sure an RCU grace period
6556 * is respected before freeing the memory containing @napi.
6557 */
6558bool napi_hash_del(struct napi_struct *napi)
6559{
6560        bool rcu_sync_needed = false;
6561
6562        spin_lock(&napi_hash_lock);
6563
6564        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6565                rcu_sync_needed = true;
6566                hlist_del_rcu(&napi->napi_hash_node);
6567        }
6568        spin_unlock(&napi_hash_lock);
6569        return rcu_sync_needed;
6570}
6571EXPORT_SYMBOL_GPL(napi_hash_del);
6572
6573static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6574{
6575        struct napi_struct *napi;
6576
6577        napi = container_of(timer, struct napi_struct, timer);
6578
6579        /* Note: we use a relaxed variant of napi_schedule_prep() that does not set
6580         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6581         */
6582        if (!napi_disable_pending(napi) &&
6583            !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6584                __napi_schedule_irqoff(napi);
6585
6586        return HRTIMER_NORESTART;
6587}
6588
6589static void init_gro_hash(struct napi_struct *napi)
6590{
6591        int i;
6592
6593        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6594                INIT_LIST_HEAD(&napi->gro_hash[i].list);
6595                napi->gro_hash[i].count = 0;
6596        }
6597        napi->gro_bitmask = 0;
6598}
6599
6600void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6601                    int (*poll)(struct napi_struct *, int), int weight)
6602{
6603        INIT_LIST_HEAD(&napi->poll_list);
6604        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6605        napi->timer.function = napi_watchdog;
6606        init_gro_hash(napi);
6607        napi->skb = NULL;
6608        INIT_LIST_HEAD(&napi->rx_list);
6609        napi->rx_count = 0;
6610        napi->poll = poll;
6611        if (weight > NAPI_POLL_WEIGHT)
6612                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6613                                weight);
6614        napi->weight = weight;
6615        napi->dev = dev;
6616#ifdef CONFIG_NETPOLL
6617        napi->poll_owner = -1;
6618#endif
6619        set_bit(NAPI_STATE_SCHED, &napi->state);
6620        set_bit(NAPI_STATE_NPSVC, &napi->state);
6621        list_add_rcu(&napi->dev_list, &dev->napi_list);
6622        napi_hash_add(napi);
6623}
6624EXPORT_SYMBOL(netif_napi_add);
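/* Usage sketch (hypothetical driver): NAPI registration is typically done at
 * probe/open time and followed by napi_enable(); priv and my_poll() are
 * invented names.
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&priv->napi);
 *
 * netif_napi_add() leaves NAPI_STATE_SCHED and NAPI_STATE_NPSVC set, so the
 * instance cannot be polled until napi_enable() clears them.
 */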
6625
6626void napi_disable(struct napi_struct *n)
6627{
6628        might_sleep();
6629        set_bit(NAPI_STATE_DISABLE, &n->state);
6630
6631        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6632                msleep(1);
6633        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6634                msleep(1);
6635
6636        hrtimer_cancel(&n->timer);
6637
6638        clear_bit(NAPI_STATE_DISABLE, &n->state);
6639}
6640EXPORT_SYMBOL(napi_disable);
6641
6642static void flush_gro_hash(struct napi_struct *napi)
6643{
6644        int i;
6645
6646        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6647                struct sk_buff *skb, *n;
6648
6649                list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6650                        kfree_skb(skb);
6651                napi->gro_hash[i].count = 0;
6652        }
6653}
6654
6655/* Must be called in process context */
6656void netif_napi_del(struct napi_struct *napi)
6657{
6658        might_sleep();
6659        if (napi_hash_del(napi))
6660                synchronize_net();
6661        list_del_init(&napi->dev_list);
6662        napi_free_frags(napi);
6663
6664        flush_gro_hash(napi);
6665        napi->gro_bitmask = 0;
6666}
6667EXPORT_SYMBOL(netif_napi_del);
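/* Usage sketch (hypothetical driver): the matching teardown order.
 * napi_disable() waits for any in-flight poll to finish before
 * netif_napi_del() unhashes the instance and frees its GRO state.
 *
 *	napi_disable(&priv->napi);
 *	netif_napi_del(&priv->napi);
 */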
6668
6669static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6670{
6671        void *have;
6672        int work, weight;
6673
6674        list_del_init(&n->poll_list);
6675
6676        have = netpoll_poll_lock(n);
6677
6678        weight = n->weight;
6679
6680        /* This NAPI_STATE_SCHED test is for avoiding a race
6681         * with netpoll's poll_napi().  Only the entity which
6682         * obtains the lock and sees NAPI_STATE_SCHED set will
6683         * actually make the ->poll() call.  Therefore we avoid
6684         * accidentally calling ->poll() when NAPI is not scheduled.
6685         */
6686        work = 0;
6687        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6688                work = n->poll(n, weight);
6689                trace_napi_poll(n, work, weight);
6690        }
6691
6692        if (unlikely(work > weight))
6693                pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6694                            n->poll, work, weight);
6695
6696        if (likely(work < weight))
6697                goto out_unlock;
6698
6699        /* Drivers must not modify the NAPI state if they
6700         * consume the entire weight.  In such cases this code
6701         * still "owns" the NAPI instance and therefore can
6702         * move the instance around on the list at-will.
6703         */
6704        if (unlikely(napi_disable_pending(n))) {
6705                napi_complete(n);
6706                goto out_unlock;
6707        }
6708
6709        if (n->gro_bitmask) {
6710                /* Flush packets that are too old.
6711                 * If HZ < 1000, flush all packets.
6712                 */
6713                napi_gro_flush(n, HZ >= 1000);
6714        }
6715
6716        gro_normal_list(n);
6717
6718        /* Some drivers may have called napi_schedule
6719         * prior to exhausting their budget.
6720         */
6721        if (unlikely(!list_empty(&n->poll_list))) {
6722                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6723                             n->dev ? n->dev->name : "backlog");
6724                goto out_unlock;
6725        }
6726
6727        list_add_tail(&n->poll_list, repoll);
6728
6729out_unlock:
6730        netpoll_poll_unlock(have);
6731
6732        return work;
6733}
6734
6735static __latent_entropy void net_rx_action(struct softirq_action *h)
6736{
6737        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6738        unsigned long time_limit = jiffies +
6739                usecs_to_jiffies(netdev_budget_usecs);
6740        int budget = netdev_budget;
6741        LIST_HEAD(list);
6742        LIST_HEAD(repoll);
6743
6744        local_irq_disable();
6745        list_splice_init(&sd->poll_list, &list);
6746        local_irq_enable();
6747
6748        for (;;) {
6749                struct napi_struct *n;
6750
6751                if (list_empty(&list)) {
6752                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6753                                goto out;
6754                        break;
6755                }
6756
6757                n = list_first_entry(&list, struct napi_struct, poll_list);
6758                budget -= napi_poll(n, &repoll);
6759
6760                /* If the softirq window is exhausted then punt.
6761                 * Allow this to run for 2 jiffies, since that will allow
6762                 * an average latency of 1.5/HZ.
6763                 */
6764                if (unlikely(budget <= 0 ||
6765                             time_after_eq(jiffies, time_limit))) {
6766                        sd->time_squeeze++;
6767                        break;
6768                }
6769        }
6770
6771        local_irq_disable();
6772
6773        list_splice_tail_init(&sd->poll_list, &list);
6774        list_splice_tail(&repoll, &list);
6775        list_splice(&list, &sd->poll_list);
6776        if (!list_empty(&sd->poll_list))
6777                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6778
6779        net_rps_action_and_irq_enable(sd);
6780out:
6781        __kfree_skb_flush();
6782}
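/* Worked example of the two limits above, assuming the usual defaults
 * (netdev_budget = 300, netdev_budget_usecs = 2000; both are sysctl-tunable,
 * so treat the numbers as illustrative): with HZ = 1000, net_rx_action()
 * stops after roughly 2 ms of wall-clock time or after 300 units of NAPI
 * budget have been consumed, whichever comes first, and sd->time_squeeze
 * counts how often it had to stop early.
 */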
6783
6784struct netdev_adjacent {
6785        struct net_device *dev;
6786
6787        /* Upper master flag; there can only be one master device per list. */
6788        bool master;
6789
6790        /* lookup ignore flag */
6791        bool ignore;
6792
6793        /* counter for the number of times this device was added to us */
6794        u16 ref_nr;
6795
6796        /* private field for the users */
6797        void *private;
6798
6799        struct list_head list;
6800        struct rcu_head rcu;
6801};
6802
6803static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6804                                                 struct list_head *adj_list)
6805{
6806        struct netdev_adjacent *adj;
6807
6808        list_for_each_entry(adj, adj_list, list) {
6809                if (adj->dev == adj_dev)
6810                        return adj;
6811        }
6812        return NULL;
6813}
6814
6815static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6816                                    struct netdev_nested_priv *priv)
6817{
6818        struct net_device *dev = (struct net_device *)priv->data;
6819
6820        return upper_dev == dev;
6821}
6822
6823/**
6824 * netdev_has_upper_dev - Check if device is linked to an upper device
6825 * @dev: device
6826 * @upper_dev: upper device to check
6827 *
6828 * Find out if a device is linked to the specified upper device and return true
6829 * in case it is. Note that this checks only the immediate upper device,
6830 * not through a complete stack of devices. The caller must hold the RTNL lock.
6831 */
6832bool netdev_has_upper_dev(struct net_device *dev,
6833                          struct net_device *upper_dev)
6834{
6835        struct netdev_nested_priv priv = {
6836                .data = (void *)upper_dev,
6837        };
6838
6839        ASSERT_RTNL();
6840
6841        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6842                                             &priv);
6843}
6844EXPORT_SYMBOL(netdev_has_upper_dev);
6845
6846/**
6847 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6848 * @dev: device
6849 * @upper_dev: upper device to check
6850 *
6851 * Find out if a device is linked to the specified upper device and return true
6852 * in case it is. Note that this checks the entire upper device chain.
6853 * The caller must hold the RCU read lock.
6854 */
6855
6856bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6857                                  struct net_device *upper_dev)
6858{
6859        struct netdev_nested_priv priv = {
6860                .data = (void *)upper_dev,
6861        };
6862
6863        return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6864                                               &priv);
6865}
6866EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6867
6868/**
6869 * netdev_has_any_upper_dev - Check if device is linked to some device
6870 * @dev: device
6871 *
6872 * Find out if a device is linked to an upper device and return true in case
6873 * it is. The caller must hold the RTNL lock.
6874 */
6875bool netdev_has_any_upper_dev(struct net_device *dev)
6876{
6877        ASSERT_RTNL();
6878
6879        return !list_empty(&dev->adj_list.upper);
6880}
6881EXPORT_SYMBOL(netdev_has_any_upper_dev);
6882
6883/**
6884 * netdev_master_upper_dev_get - Get master upper device
6885 * @dev: device
6886 *
6887 * Find a master upper device and return pointer to it or NULL in case
6888 * it's not there. The caller must hold the RTNL lock.
6889 */
6890struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6891{
6892        struct netdev_adjacent *upper;
6893
6894        ASSERT_RTNL();
6895
6896        if (list_empty(&dev->adj_list.upper))
6897                return NULL;
6898
6899        upper = list_first_entry(&dev->adj_list.upper,
6900                                 struct netdev_adjacent, list);
6901        if (likely(upper->master))
6902                return upper->dev;
6903        return NULL;
6904}
6905EXPORT_SYMBOL(netdev_master_upper_dev_get);
6906
6907static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6908{
6909        struct netdev_adjacent *upper;
6910
6911        ASSERT_RTNL();
6912
6913        if (list_empty(&dev->adj_list.upper))
6914                return NULL;
6915
6916        upper = list_first_entry(&dev->adj_list.upper,
6917                                 struct netdev_adjacent, list);
6918        if (likely(upper->master) && !upper->ignore)
6919                return upper->dev;
6920        return NULL;
6921}
6922
6923/**
6924 * netdev_has_any_lower_dev - Check if device is linked to some device
6925 * @dev: device
6926 *
6927 * Find out if a device is linked to a lower device and return true in case
6928 * it is. The caller must hold the RTNL lock.
6929 */
6930static bool netdev_has_any_lower_dev(struct net_device *dev)
6931{
6932        ASSERT_RTNL();
6933
6934        return !list_empty(&dev->adj_list.lower);
6935}
6936
6937void *netdev_adjacent_get_private(struct list_head *adj_list)
6938{
6939        struct netdev_adjacent *adj;
6940
6941        adj = list_entry(adj_list, struct netdev_adjacent, list);
6942
6943        return adj->private;
6944}
6945EXPORT_SYMBOL(netdev_adjacent_get_private);
6946
6947/**
6948 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6949 * @dev: device
6950 * @iter: list_head ** of the current position
6951 *
6952 * Gets the next device from the dev's upper list, starting from iter
6953 * position. The caller must hold RCU read lock.
6954 */
6955struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6956                                                 struct list_head **iter)
6957{
6958        struct netdev_adjacent *upper;
6959
6960        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6961
6962        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6963
6964        if (&upper->list == &dev->adj_list.upper)
6965                return NULL;
6966
6967        *iter = &upper->list;
6968
6969        return upper->dev;
6970}
6971EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6972
6973static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6974                                                  struct list_head **iter,
6975                                                  bool *ignore)
6976{
6977        struct netdev_adjacent *upper;
6978
6979        upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6980
6981        if (&upper->list == &dev->adj_list.upper)
6982                return NULL;
6983
6984        *iter = &upper->list;
6985        *ignore = upper->ignore;
6986
6987        return upper->dev;
6988}
6989
6990static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6991                                                    struct list_head **iter)
6992{
6993        struct netdev_adjacent *upper;
6994
6995        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6996
6997        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6998
6999        if (&upper->list == &dev->adj_list.upper)
7000                return NULL;
7001
7002        *iter = &upper->list;
7003
7004        return upper->dev;
7005}
7006
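/* The walkers below do an iterative depth-first traversal of the adjacency
 * lists. Instead of recursing, they keep an explicit stack of
 * (device, list position) pairs in dev_stack[] / iter_stack[]; because
 * nesting depth is capped at MAX_NEST_DEV when links are created, the
 * fixed-size stacks cannot overflow. The __netdev_* variants additionally
 * skip adjacencies marked ->ignore.
 */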
7007static int __netdev_walk_all_upper_dev(struct net_device *dev,
7008                                       int (*fn)(struct net_device *dev,
7009                                         struct netdev_nested_priv *priv),
7010                                       struct netdev_nested_priv *priv)
7011{
7012        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7013        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7014        int ret, cur = 0;
7015        bool ignore;
7016
7017        now = dev;
7018        iter = &dev->adj_list.upper;
7019
7020        while (1) {
7021                if (now != dev) {
7022                        ret = fn(now, priv);
7023                        if (ret)
7024                                return ret;
7025                }
7026
7027                next = NULL;
7028                while (1) {
7029                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
7030                        if (!udev)
7031                                break;
7032                        if (ignore)
7033                                continue;
7034
7035                        next = udev;
7036                        niter = &udev->adj_list.upper;
7037                        dev_stack[cur] = now;
7038                        iter_stack[cur++] = iter;
7039                        break;
7040                }
7041
7042                if (!next) {
7043                        if (!cur)
7044                                return 0;
7045                        next = dev_stack[--cur];
7046                        niter = iter_stack[cur];
7047                }
7048
7049                now = next;
7050                iter = niter;
7051        }
7052
7053        return 0;
7054}
7055
7056int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7057                                  int (*fn)(struct net_device *dev,
7058                                            struct netdev_nested_priv *priv),
7059                                  struct netdev_nested_priv *priv)
7060{
7061        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7062        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7063        int ret, cur = 0;
7064
7065        now = dev;
7066        iter = &dev->adj_list.upper;
7067
7068        while (1) {
7069                if (now != dev) {
7070                        ret = fn(now, priv);
7071                        if (ret)
7072                                return ret;
7073                }
7074
7075                next = NULL;
7076                while (1) {
7077                        udev = netdev_next_upper_dev_rcu(now, &iter);
7078                        if (!udev)
7079                                break;
7080
7081                        next = udev;
7082                        niter = &udev->adj_list.upper;
7083                        dev_stack[cur] = now;
7084                        iter_stack[cur++] = iter;
7085                        break;
7086                }
7087
7088                if (!next) {
7089                        if (!cur)
7090                                return 0;
7091                        next = dev_stack[--cur];
7092                        niter = iter_stack[cur];
7093                }
7094
7095                now = next;
7096                iter = niter;
7097        }
7098
7099        return 0;
7100}
7101EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
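/* Usage sketch (hypothetical caller): the walker visits every upper device
 * and stops early as soon as the callback returns non-zero, exactly as
 * ____netdev_has_upper_dev() above relies on; my_match() and target are
 * invented names.
 *
 *	static int my_match(struct net_device *dev,
 *			    struct netdev_nested_priv *priv)
 *	{
 *		return dev == (struct net_device *)priv->data;
 *	}
 *
 *	struct netdev_nested_priv priv = { .data = (void *)target };
 *	int found;
 *
 *	rcu_read_lock();
 *	found = netdev_walk_all_upper_dev_rcu(dev, my_match, &priv);
 *	rcu_read_unlock();
 */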
7102
7103static bool __netdev_has_upper_dev(struct net_device *dev,
7104                                   struct net_device *upper_dev)
7105{
7106        struct netdev_nested_priv priv = {
7107                .flags = 0,
7108                .data = (void *)upper_dev,
7109        };
7110
7111        ASSERT_RTNL();
7112
7113        return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7114                                           &priv);
7115}
7116
7117/**
7118 * netdev_lower_get_next_private - Get the next ->private from the
7119 *                                 lower neighbour list
7120 * @dev: device
7121 * @iter: list_head ** of the current position
7122 *
7123 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7124 * list, starting from iter position. The caller must either hold the
7125 * RTNL lock or its own locking that guarantees that the neighbour lower
7126 * list will remain unchanged.
7127 */
7128void *netdev_lower_get_next_private(struct net_device *dev,
7129                                    struct list_head **iter)
7130{
7131        struct netdev_adjacent *lower;
7132
7133        lower = list_entry(*iter, struct netdev_adjacent, list);
7134
7135        if (&lower->list == &dev->adj_list.lower)
7136                return NULL;
7137
7138        *iter = lower->list.next;
7139
7140        return lower->private;
7141}
7142EXPORT_SYMBOL(netdev_lower_get_next_private);
7143
7144/**
7145 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7146 *                                     lower neighbour list, RCU
7147 *                                     variant
7148 * @dev: device
7149 * @iter: list_head ** of the current position
7150 *
7151 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7152 * list, starting from iter position. The caller must hold RCU read lock.
7153 */
7154void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7155                                        struct list_head **iter)
7156{
7157        struct netdev_adjacent *lower;
7158
7159        WARN_ON_ONCE(!rcu_read_lock_held());
7160
7161        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7162
7163        if (&lower->list == &dev->adj_list.lower)
7164                return NULL;
7165
7166        *iter = &lower->list;
7167
7168        return lower->private;
7169}
7170EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7171
7172/**
7173 * netdev_lower_get_next - Get the next device from the lower neighbour
7174 *                         list
7175 * @dev: device
7176 * @iter: list_head ** of the current position
7177 *
7178 * Gets the next netdev_adjacent from the dev's lower neighbour
7179 * list, starting from iter position. The caller must hold the RTNL lock or
7180 * its own locking that guarantees that the neighbour lower
7181 * list will remain unchanged.
7182 */
7183void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7184{
7185        struct netdev_adjacent *lower;
7186
7187        lower = list_entry(*iter, struct netdev_adjacent, list);
7188
7189        if (&lower->list == &dev->adj_list.lower)
7190                return NULL;
7191
7192        *iter = lower->list.next;
7193
7194        return lower->dev;
7195}
7196EXPORT_SYMBOL(netdev_lower_get_next);
7197
7198static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7199                                                struct list_head **iter)
7200{
7201        struct netdev_adjacent *lower;
7202
7203        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7204
7205        if (&lower->list == &dev->adj_list.lower)
7206                return NULL;
7207
7208        *iter = &lower->list;
7209
7210        return lower->dev;
7211}
7212
7213static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7214                                                  struct list_head **iter,
7215                                                  bool *ignore)
7216{
7217        struct netdev_adjacent *lower;
7218
7219        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7220
7221        if (&lower->list == &dev->adj_list.lower)
7222                return NULL;
7223
7224        *iter = &lower->list;
7225        *ignore = lower->ignore;
7226
7227        return lower->dev;
7228}
7229
7230int netdev_walk_all_lower_dev(struct net_device *dev,
7231                              int (*fn)(struct net_device *dev,
7232                                        struct netdev_nested_priv *priv),
7233                              struct netdev_nested_priv *priv)
7234{
7235        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7236        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7237        int ret, cur = 0;
7238
7239        now = dev;
7240        iter = &dev->adj_list.lower;
7241
7242        while (1) {
7243                if (now != dev) {
7244                        ret = fn(now, priv);
7245                        if (ret)
7246                                return ret;
7247                }
7248
7249                next = NULL;
7250                while (1) {
7251                        ldev = netdev_next_lower_dev(now, &iter);
7252                        if (!ldev)
7253                                break;
7254
7255                        next = ldev;
7256                        niter = &ldev->adj_list.lower;
7257                        dev_stack[cur] = now;
7258                        iter_stack[cur++] = iter;
7259                        break;
7260                }
7261
7262                if (!next) {
7263                        if (!cur)
7264                                return 0;
7265                        next = dev_stack[--cur];
7266                        niter = iter_stack[cur];
7267                }
7268
7269                now = next;
7270                iter = niter;
7271        }
7272
7273        return 0;
7274}
7275EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7276
7277static int __netdev_walk_all_lower_dev(struct net_device *dev,
7278                                       int (*fn)(struct net_device *dev,
7279                                         struct netdev_nested_priv *priv),
7280                                       struct netdev_nested_priv *priv)
7281{
7282        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7283        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7284        int ret, cur = 0;
7285        bool ignore;
7286
7287        now = dev;
7288        iter = &dev->adj_list.lower;
7289
7290        while (1) {
7291                if (now != dev) {
7292                        ret = fn(now, priv);
7293                        if (ret)
7294                                return ret;
7295                }
7296
7297                next = NULL;
7298                while (1) {
7299                        ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7300                        if (!ldev)
7301                                break;
7302                        if (ignore)
7303                                continue;
7304
7305                        next = ldev;
7306                        niter = &ldev->adj_list.lower;
7307                        dev_stack[cur] = now;
7308                        iter_stack[cur++] = iter;
7309                        break;
7310                }
7311
7312                if (!next) {
7313                        if (!cur)
7314                                return 0;
7315                        next = dev_stack[--cur];
7316                        niter = iter_stack[cur];
7317                }
7318
7319                now = next;
7320                iter = niter;
7321        }
7322
7323        return 0;
7324}
7325
7326struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7327                                             struct list_head **iter)
7328{
7329        struct netdev_adjacent *lower;
7330
7331        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7332        if (&lower->list == &dev->adj_list.lower)
7333                return NULL;
7334
7335        *iter = &lower->list;
7336
7337        return lower->dev;
7338}
7339EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7340
7341static u8 __netdev_upper_depth(struct net_device *dev)
7342{
7343        struct net_device *udev;
7344        struct list_head *iter;
7345        u8 max_depth = 0;
7346        bool ignore;
7347
7348        for (iter = &dev->adj_list.upper,
7349             udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7350             udev;
7351             udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7352                if (ignore)
7353                        continue;
7354                if (max_depth < udev->upper_level)
7355                        max_depth = udev->upper_level;
7356        }
7357
7358        return max_depth;
7359}
7360
7361static u8 __netdev_lower_depth(struct net_device *dev)
7362{
7363        struct net_device *ldev;
7364        struct list_head *iter;
7365        u8 max_depth = 0;
7366        bool ignore;
7367
7368        for (iter = &dev->adj_list.lower,
7369             ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7370             ldev;
7371             ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7372                if (ignore)
7373                        continue;
7374                if (max_depth < ldev->lower_level)
7375                        max_depth = ldev->lower_level;
7376        }
7377
7378        return max_depth;
7379}
7380
7381static int __netdev_update_upper_level(struct net_device *dev,
7382                                       struct netdev_nested_priv *__unused)
7383{
7384        dev->upper_level = __netdev_upper_depth(dev) + 1;
7385        return 0;
7386}
7387
7388static int __netdev_update_lower_level(struct net_device *dev,
7389                                       struct netdev_nested_priv *priv)
7390{
7391        dev->lower_level = __netdev_lower_depth(dev) + 1;
7392
7393#ifdef CONFIG_LOCKDEP
7394        if (!priv)
7395                return 0;
7396
7397        if (priv->flags & NESTED_SYNC_IMM)
7398                dev->nested_level = dev->lower_level - 1;
7399        if (priv->flags & NESTED_SYNC_TODO)
7400                net_unlink_todo(dev);
7401#endif
7402        return 0;
7403}
7404
7405int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7406                                  int (*fn)(struct net_device *dev,
7407                                            struct netdev_nested_priv *priv),
7408                                  struct netdev_nested_priv *priv)
7409{
7410        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7411        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7412        int ret, cur = 0;
7413
7414        now = dev;
7415        iter = &dev->adj_list.lower;
7416
7417        while (1) {
7418                if (now != dev) {
7419                        ret = fn(now, priv);
7420                        if (ret)
7421                                return ret;
7422                }
7423
7424                next = NULL;
7425                while (1) {
7426                        ldev = netdev_next_lower_dev_rcu(now, &iter);
7427                        if (!ldev)
7428                                break;
7429
7430                        next = ldev;
7431                        niter = &ldev->adj_list.lower;
7432                        dev_stack[cur] = now;
7433                        iter_stack[cur++] = iter;
7434                        break;
7435                }
7436
7437                if (!next) {
7438                        if (!cur)
7439                                return 0;
7440                        next = dev_stack[--cur];
7441                        niter = iter_stack[cur];
7442                }
7443
7444                now = next;
7445                iter = niter;
7446        }
7447
7448        return 0;
7449}
7450EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7451
7452/**
7453 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7454 *                                     lower neighbour list, RCU
7455 *                                     variant
7456 * @dev: device
7457 *
7458 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7459 * list. The caller must hold RCU read lock.
7460 */
7461void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7462{
7463        struct netdev_adjacent *lower;
7464
7465        lower = list_first_or_null_rcu(&dev->adj_list.lower,
7466                        struct netdev_adjacent, list);
7467        if (lower)
7468                return lower->private;
7469        return NULL;
7470}
7471EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7472
7473/**
7474 * netdev_master_upper_dev_get_rcu - Get master upper device
7475 * @dev: device
7476 *
7477 * Find a master upper device and return pointer to it or NULL in case
7478 * it's not there. The caller must hold the RCU read lock.
7479 */
7480struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7481{
7482        struct netdev_adjacent *upper;
7483
7484        upper = list_first_or_null_rcu(&dev->adj_list.upper,
7485                                       struct netdev_adjacent, list);
7486        if (upper && likely(upper->master))
7487                return upper->dev;
7488        return NULL;
7489}
7490EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7491
7492static int netdev_adjacent_sysfs_add(struct net_device *dev,
7493                              struct net_device *adj_dev,
7494                              struct list_head *dev_list)
7495{
7496        char linkname[IFNAMSIZ+7];
7497
7498        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7499                "upper_%s" : "lower_%s", adj_dev->name);
7500        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7501                                 linkname);
7502}
7503static void netdev_adjacent_sysfs_del(struct net_device *dev,
7504                               char *name,
7505                               struct list_head *dev_list)
7506{
7507        char linkname[IFNAMSIZ+7];
7508
7509        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7510                "upper_%s" : "lower_%s", name);
7511        sysfs_remove_link(&(dev->dev.kobj), linkname);
7512}
7513
7514static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7515                                                 struct net_device *adj_dev,
7516                                                 struct list_head *dev_list)
7517{
7518        return (dev_list == &dev->adj_list.upper ||
7519                dev_list == &dev->adj_list.lower) &&
7520                net_eq(dev_net(dev), dev_net(adj_dev));
7521}
7522
7523static int __netdev_adjacent_dev_insert(struct net_device *dev,
7524                                        struct net_device *adj_dev,
7525                                        struct list_head *dev_list,
7526                                        void *private, bool master)
7527{
7528        struct netdev_adjacent *adj;
7529        int ret;
7530
7531        adj = __netdev_find_adj(adj_dev, dev_list);
7532
7533        if (adj) {
7534                adj->ref_nr += 1;
7535                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7536                         dev->name, adj_dev->name, adj->ref_nr);
7537
7538                return 0;
7539        }
7540
7541        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7542        if (!adj)
7543                return -ENOMEM;
7544
7545        adj->dev = adj_dev;
7546        adj->master = master;
7547        adj->ref_nr = 1;
7548        adj->private = private;
7549        adj->ignore = false;
7550        dev_hold(adj_dev);
7551
7552        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7553                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7554
7555        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7556                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7557                if (ret)
7558                        goto free_adj;
7559        }
7560
7561        /* Ensure that master link is always the first item in list. */
7562        if (master) {
7563                ret = sysfs_create_link(&(dev->dev.kobj),
7564                                        &(adj_dev->dev.kobj), "master");
7565                if (ret)
7566                        goto remove_symlinks;
7567
7568                list_add_rcu(&adj->list, dev_list);
7569        } else {
7570                list_add_tail_rcu(&adj->list, dev_list);
7571        }
7572
7573        return 0;
7574
7575remove_symlinks:
7576        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7577                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7578free_adj:
7579        kfree(adj);
7580        dev_put(adj_dev);
7581
7582        return ret;
7583}
7584
7585static void __netdev_adjacent_dev_remove(struct net_device *dev,
7586                                         struct net_device *adj_dev,
7587                                         u16 ref_nr,
7588                                         struct list_head *dev_list)
7589{
7590        struct netdev_adjacent *adj;
7591
7592        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7593                 dev->name, adj_dev->name, ref_nr);
7594
7595        adj = __netdev_find_adj(adj_dev, dev_list);
7596
7597        if (!adj) {
7598                pr_err("Adjacency does not exist for device %s from %s\n",
7599                       dev->name, adj_dev->name);
7600                WARN_ON(1);
7601                return;
7602        }
7603
7604        if (adj->ref_nr > ref_nr) {
7605                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7606                         dev->name, adj_dev->name, ref_nr,
7607                         adj->ref_nr - ref_nr);
7608                adj->ref_nr -= ref_nr;
7609                return;
7610        }
7611
7612        if (adj->master)
7613                sysfs_remove_link(&(dev->dev.kobj), "master");
7614
7615        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7616                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7617
7618        list_del_rcu(&adj->list);
7619        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7620                 adj_dev->name, dev->name, adj_dev->name);
7621        dev_put(adj_dev);
7622        kfree_rcu(adj, rcu);
7623}
7624
7625static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7626                                            struct net_device *upper_dev,
7627                                            struct list_head *up_list,
7628                                            struct list_head *down_list,
7629                                            void *private, bool master)
7630{
7631        int ret;
7632
7633        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7634                                           private, master);
7635        if (ret)
7636                return ret;
7637
7638        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7639                                           private, false);
7640        if (ret) {
7641                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7642                return ret;
7643        }
7644
7645        return 0;
7646}
7647
7648static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7649                                               struct net_device *upper_dev,
7650                                               u16 ref_nr,
7651                                               struct list_head *up_list,
7652                                               struct list_head *down_list)
7653{
7654        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7655        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7656}
7657
7658static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7659                                                struct net_device *upper_dev,
7660                                                void *private, bool master)
7661{
7662        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7663                                                &dev->adj_list.upper,
7664                                                &upper_dev->adj_list.lower,
7665                                                private, master);
7666}
7667
7668static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7669                                                   struct net_device *upper_dev)
7670{
7671        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7672                                           &dev->adj_list.upper,
7673                                           &upper_dev->adj_list.lower);
7674}
7675
7676static int __netdev_upper_dev_link(struct net_device *dev,
7677                                   struct net_device *upper_dev, bool master,
7678                                   void *upper_priv, void *upper_info,
7679                                   struct netdev_nested_priv *priv,
7680                                   struct netlink_ext_ack *extack)
7681{
7682        struct netdev_notifier_changeupper_info changeupper_info = {
7683                .info = {
7684                        .dev = dev,
7685                        .extack = extack,
7686                },
7687                .upper_dev = upper_dev,
7688                .master = master,
7689                .linking = true,
7690                .upper_info = upper_info,
7691        };
7692        struct net_device *master_dev;
7693        int ret = 0;
7694
7695        ASSERT_RTNL();
7696
7697        if (dev == upper_dev)
7698                return -EBUSY;
7699
7700        /* To prevent loops, check if dev is not upper device to upper_dev. */
7701        if (__netdev_has_upper_dev(upper_dev, dev))
7702                return -EBUSY;
7703
7704        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7705                return -EMLINK;
7706
7707        if (!master) {
7708                if (__netdev_has_upper_dev(dev, upper_dev))
7709                        return -EEXIST;
7710        } else {
7711                master_dev = __netdev_master_upper_dev_get(dev);
7712                if (master_dev)
7713                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
7714        }
7715
7716        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7717                                            &changeupper_info.info);
7718        ret = notifier_to_errno(ret);
7719        if (ret)
7720                return ret;
7721
7722        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7723                                                   master);
7724        if (ret)
7725                return ret;
7726
7727        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7728                                            &changeupper_info.info);
7729        ret = notifier_to_errno(ret);
7730        if (ret)
7731                goto rollback;
7732
7733        __netdev_update_upper_level(dev, NULL);
7734        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7735
7736        __netdev_update_lower_level(upper_dev, priv);
7737        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7738                                    priv);
7739
7740        return 0;
7741
7742rollback:
7743        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7744
7745        return ret;
7746}
7747
7748/**
7749 * netdev_upper_dev_link - Add a link to the upper device
7750 * @dev: device
7751 * @upper_dev: new upper device
7752 * @extack: netlink extended ack
7753 *
7754 * Adds a link to device which is upper to this one. The caller must hold
7755 * the RTNL lock. On a failure a negative errno code is returned.
7756 * On success the reference counts are adjusted and the function
7757 * returns zero.
7758 */
7759int netdev_upper_dev_link(struct net_device *dev,
7760                          struct net_device *upper_dev,
7761                          struct netlink_ext_ack *extack)
7762{
7763        struct netdev_nested_priv priv = {
7764                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7765                .data = NULL,
7766        };
7767
7768        return __netdev_upper_dev_link(dev, upper_dev, false,
7769                                       NULL, NULL, &priv, extack);
7770}
7771EXPORT_SYMBOL(netdev_upper_dev_link);
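
/* Usage sketch (illustrative only): a hypothetical stacking driver making
 * "lower" a non-master lower device of "upper".  The example_* name is an
 * assumption, not part of this file; the caller must hold the RTNL lock.
 */
static int example_stack_link(struct net_device *lower,
			      struct net_device *upper,
			      struct netlink_ext_ack *extack)
{
	int err;

	ASSERT_RTNL();

	err = netdev_upper_dev_link(lower, upper, extack);
	if (err)
		netdev_err(lower, "failed to link under %s: %d\n",
			   upper->name, err);
	return err;
}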
7772
7773/**
7774 * netdev_master_upper_dev_link - Add a master link to the upper device
7775 * @dev: device
7776 * @upper_dev: new upper device
7777 * @upper_priv: upper device private
7778 * @upper_info: upper info to be passed down via notifier
7779 * @extack: netlink extended ack
7780 *
7781 * Adds a link to a device which is upper to this one. In this case, only
7782 * one master upper device can be linked, although other non-master devices
7783 * might be linked as well. The caller must hold the RTNL lock.
7784 * On a failure a negative errno code is returned. On success the reference
7785 * counts are adjusted and the function returns zero.
7786 */
7787int netdev_master_upper_dev_link(struct net_device *dev,
7788                                 struct net_device *upper_dev,
7789                                 void *upper_priv, void *upper_info,
7790                                 struct netlink_ext_ack *extack)
7791{
7792        struct netdev_nested_priv priv = {
7793                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7794                .data = NULL,
7795        };
7796
7797        return __netdev_upper_dev_link(dev, upper_dev, true,
7798                                       upper_priv, upper_info, &priv, extack);
7799}
7800EXPORT_SYMBOL(netdev_master_upper_dev_link);
7801
7802static void __netdev_upper_dev_unlink(struct net_device *dev,
7803                                      struct net_device *upper_dev,
7804                                      struct netdev_nested_priv *priv)
7805{
7806        struct netdev_notifier_changeupper_info changeupper_info = {
7807                .info = {
7808                        .dev = dev,
7809                },
7810                .upper_dev = upper_dev,
7811                .linking = false,
7812        };
7813
7814        ASSERT_RTNL();
7815
7816        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7817
7818        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7819                                      &changeupper_info.info);
7820
7821        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7822
7823        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7824                                      &changeupper_info.info);
7825
7826        __netdev_update_upper_level(dev, NULL);
7827        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7828
7829        __netdev_update_lower_level(upper_dev, priv);
7830        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7831                                    priv);
7832}
7833
7834/**
7835 * netdev_upper_dev_unlink - Removes a link to upper device
7836 * @dev: device
7837 * @upper_dev: upper device to unlink
7838 *
7839 * Removes a link to a device which is upper to this one. The caller must hold
7840 * the RTNL lock.
7841 */
7842void netdev_upper_dev_unlink(struct net_device *dev,
7843                             struct net_device *upper_dev)
7844{
7845        struct netdev_nested_priv priv = {
7846                .flags = NESTED_SYNC_TODO,
7847                .data = NULL,
7848        };
7849
7850        __netdev_upper_dev_unlink(dev, upper_dev, &priv);
7851}
7852EXPORT_SYMBOL(netdev_upper_dev_unlink);
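
/* Usage sketch (illustrative only): how a bonding-style driver might pair
 * netdev_master_upper_dev_link() on enslave with netdev_upper_dev_unlink()
 * on release.  The example_* names are hypothetical; both calls require the
 * RTNL lock, and upper_priv/upper_info are left NULL here for brevity.
 */
static int example_enslave(struct net_device *master, struct net_device *slave,
			   struct netlink_ext_ack *extack)
{
	return netdev_master_upper_dev_link(slave, master, NULL, NULL, extack);
}

static void example_release(struct net_device *master, struct net_device *slave)
{
	netdev_upper_dev_unlink(slave, master);
}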
7853
7854static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7855                                      struct net_device *lower_dev,
7856                                      bool val)
7857{
7858        struct netdev_adjacent *adj;
7859
7860        adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7861        if (adj)
7862                adj->ignore = val;
7863
7864        adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7865        if (adj)
7866                adj->ignore = val;
7867}
7868
7869static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7870                                        struct net_device *lower_dev)
7871{
7872        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7873}
7874
7875static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7876                                       struct net_device *lower_dev)
7877{
7878        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7879}
7880
7881int netdev_adjacent_change_prepare(struct net_device *old_dev,
7882                                   struct net_device *new_dev,
7883                                   struct net_device *dev,
7884                                   struct netlink_ext_ack *extack)
7885{
7886        struct netdev_nested_priv priv = {
7887                .flags = 0,
7888                .data = NULL,
7889        };
7890        int err;
7891
7892        if (!new_dev)
7893                return 0;
7894
7895        if (old_dev && new_dev != old_dev)
7896                netdev_adjacent_dev_disable(dev, old_dev);
7897        err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7898                                      extack);
7899        if (err) {
7900                if (old_dev && new_dev != old_dev)
7901                        netdev_adjacent_dev_enable(dev, old_dev);
7902                return err;
7903        }
7904
7905        return 0;
7906}
7907EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7908
7909void netdev_adjacent_change_commit(struct net_device *old_dev,
7910                                   struct net_device *new_dev,
7911                                   struct net_device *dev)
7912{
7913        struct netdev_nested_priv priv = {
7914                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7915                .data = NULL,
7916        };
7917
7918        if (!new_dev || !old_dev)
7919                return;
7920
7921        if (new_dev == old_dev)
7922                return;
7923
7924        netdev_adjacent_dev_enable(dev, old_dev);
7925        __netdev_upper_dev_unlink(old_dev, dev, &priv);
7926}
7927EXPORT_SYMBOL(netdev_adjacent_change_commit);
7928
7929void netdev_adjacent_change_abort(struct net_device *old_dev,
7930                                  struct net_device *new_dev,
7931                                  struct net_device *dev)
7932{
7933        struct netdev_nested_priv priv = {
7934                .flags = 0,
7935                .data = NULL,
7936        };
7937
7938        if (!new_dev)
7939                return;
7940
7941        if (old_dev && new_dev != old_dev)
7942                netdev_adjacent_dev_enable(dev, old_dev);
7943
7944        __netdev_upper_dev_unlink(new_dev, dev, &priv);
7945}
7946EXPORT_SYMBOL(netdev_adjacent_change_abort);
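
/* Usage sketch (illustrative only): the prepare/commit/abort pattern used
 * when a master switches its "active" lower device from old_dev to new_dev,
 * as bonding/team do for active-backup.  The example_* name is hypothetical,
 * error handling is minimal, and RTNL must be held throughout.
 */
static int example_switch_active_lower(struct net_device *master,
				       struct net_device *old_dev,
				       struct net_device *new_dev,
				       struct netlink_ext_ack *extack)
{
	int err;

	err = netdev_adjacent_change_prepare(old_dev, new_dev, master, extack);
	if (err)
		return err;

	/* ...driver-specific switch-over would happen here; on failure the
	 * driver would call netdev_adjacent_change_abort() instead...
	 */

	netdev_adjacent_change_commit(old_dev, new_dev, master);
	return 0;
}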
7947
7948/**
7949 * netdev_bonding_info_change - Dispatch event about slave change
7950 * @dev: device
7951 * @bonding_info: info to dispatch
7952 *
7953 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7954 * The caller must hold the RTNL lock.
7955 */
7956void netdev_bonding_info_change(struct net_device *dev,
7957                                struct netdev_bonding_info *bonding_info)
7958{
7959        struct netdev_notifier_bonding_info info = {
7960                .info.dev = dev,
7961        };
7962
7963        memcpy(&info.bonding_info, bonding_info,
7964               sizeof(struct netdev_bonding_info));
7965        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7966                                      &info.info);
7967}
7968EXPORT_SYMBOL(netdev_bonding_info_change);
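
/* Usage sketch (illustrative only): a master driver pushing per-slave state
 * to NETDEV_BONDING_INFO listeners (e.g. RDMA drivers tracking LAG state).
 * The example_* name and the field values are made up; RTNL must be held.
 */
static void example_notify_slave_state(struct net_device *slave_dev, bool up)
{
	struct netdev_bonding_info binfo;

	memset(&binfo, 0, sizeof(binfo));
	binfo.slave.state = up ? 0 : 1;		/* illustrative values only */
	netdev_bonding_info_change(slave_dev, &binfo);
}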
7969
7970/**
7971 * netdev_get_xmit_slave - Get the xmit slave of master device
7972 * @dev: device
7973 * @skb: The packet
7974 * @all_slaves: assume all the slaves are active
7975 *
7976 * The reference counters are not incremented so the caller must be
7977 * careful with locks. The caller must hold the RCU read lock.
7978 * %NULL is returned if no slave is found.
7979 */
7980
7981struct net_device *netdev_get_xmit_slave(struct net_device *dev,
7982                                         struct sk_buff *skb,
7983                                         bool all_slaves)
7984{
7985        const struct net_device_ops *ops = dev->netdev_ops;
7986
7987        if (!ops->ndo_get_xmit_slave)
7988                return NULL;
7989        return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
7990}
7991EXPORT_SYMBOL(netdev_get_xmit_slave);
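
/* Usage sketch (illustrative only): resolving which slave a LAG master would
 * use to transmit a given skb.  The example_* name is hypothetical; note the
 * RCU read lock requirement and that no reference is taken on the result, so
 * only the ifindex is carried outside the RCU section here.
 */
static int example_xmit_slave_ifindex(struct net_device *master,
				      struct sk_buff *skb)
{
	struct net_device *slave;
	int ifindex = 0;

	rcu_read_lock();
	slave = netdev_get_xmit_slave(master, skb, false);
	if (slave)
		ifindex = slave->ifindex;
	rcu_read_unlock();

	return ifindex;
}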
7992
7993static void netdev_adjacent_add_links(struct net_device *dev)
7994{
7995        struct netdev_adjacent *iter;
7996
7997        struct net *net = dev_net(dev);
7998
7999        list_for_each_entry(iter, &dev->adj_list.upper, list) {
8000                if (!net_eq(net, dev_net(iter->dev)))
8001                        continue;
8002                netdev_adjacent_sysfs_add(iter->dev, dev,
8003                                          &iter->dev->adj_list.lower);
8004                netdev_adjacent_sysfs_add(dev, iter->dev,
8005                                          &dev->adj_list.upper);
8006        }
8007
8008        list_for_each_entry(iter, &dev->adj_list.lower, list) {
8009                if (!net_eq(net, dev_net(iter->dev)))
8010                        continue;
8011                netdev_adjacent_sysfs_add(iter->dev, dev,
8012                                          &iter->dev->adj_list.upper);
8013                netdev_adjacent_sysfs_add(dev, iter->dev,
8014                                          &dev->adj_list.lower);
8015        }
8016}
8017
8018static void netdev_adjacent_del_links(struct net_device *dev)
8019{
8020        struct netdev_adjacent *iter;
8021
8022        struct net *net = dev_net(dev);
8023
8024        list_for_each_entry(iter, &dev->adj_list.upper, list) {
8025                if (!net_eq(net, dev_net(iter->dev)))
8026                        continue;
8027                netdev_adjacent_sysfs_del(iter->dev, dev->name,
8028                                          &iter->dev->adj_list.lower);
8029                netdev_adjacent_sysfs_del(dev, iter->dev->name,
8030                                          &dev->adj_list.upper);
8031        }
8032
8033        list_for_each_entry(iter, &dev->adj_list.lower, list) {
8034                if (!net_eq(net, dev_net(iter->dev)))
8035                        continue;
8036                netdev_adjacent_sysfs_del(iter->dev, dev->name,
8037                                          &iter->dev->adj_list.upper);
8038                netdev_adjacent_sysfs_del(dev, iter->dev->name,
8039                                          &dev->adj_list.lower);
8040        }
8041}
8042
8043void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8044{
8045        struct netdev_adjacent *iter;
8046
8047        struct net *net = dev_net(dev);
8048
8049        list_for_each_entry(iter, &dev->adj_list.upper, list) {
8050                if (!net_eq(net, dev_net(iter->dev)))
8051                        continue;
8052                netdev_adjacent_sysfs_del(iter->dev, oldname,
8053                                          &iter->dev->adj_list.lower);
8054                netdev_adjacent_sysfs_add(iter->dev, dev,
8055                                          &iter->dev->adj_list.lower);
8056        }
8057
8058        list_for_each_entry(iter, &dev->adj_list.lower, list) {
8059                if (!net_eq(net, dev_net(iter->dev)))
8060                        continue;
8061                netdev_adjacent_sysfs_del(iter->dev, oldname,
8062                                          &iter->dev->adj_list.upper);
8063                netdev_adjacent_sysfs_add(iter->dev, dev,
8064                                          &iter->dev->adj_list.upper);
8065        }
8066}
8067
8068void *netdev_lower_dev_get_private(struct net_device *dev,
8069                                   struct net_device *lower_dev)
8070{
8071        struct netdev_adjacent *lower;
8072
8073        if (!lower_dev)
8074                return NULL;
8075        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8076        if (!lower)
8077                return NULL;
8078
8079        return lower->private;
8080}
8081EXPORT_SYMBOL(netdev_lower_dev_get_private);
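
/* Usage sketch (illustrative only): retrieving the per-neighbour private
 * pointer that a master driver passed as upper_priv when the slave was
 * linked with netdev_master_upper_dev_link().  The example_* names are
 * hypothetical; the adjacency lists must be protected (RTNL) while this runs.
 */
struct example_slave_priv {
	u32 id;
};

static u32 example_slave_id(struct net_device *master, struct net_device *slave)
{
	struct example_slave_priv *priv;

	priv = netdev_lower_dev_get_private(master, slave);
	return priv ? priv->id : 0;
}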
8082
8083
8084/**
8085 * netdev_lower_state_changed - Dispatch event about lower device state change
8086 * @lower_dev: device
8087 * @lower_state_info: state to dispatch
8088 *
8089 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8090 * The caller must hold the RTNL lock.
8091 */
8092void netdev_lower_state_changed(struct net_device *lower_dev,
8093                                void *lower_state_info)
8094{
8095        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8096                .info.dev = lower_dev,
8097        };
8098
8099        ASSERT_RTNL();
8100        changelowerstate_info.lower_state_info = lower_state_info;
8101        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8102                                      &changelowerstate_info.info);
8103}
8104EXPORT_SYMBOL(netdev_lower_state_changed);
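
/* Usage sketch (illustrative only): a master driver publishing updated state
 * for one of its lower devices to NETDEV_CHANGELOWERSTATE listeners.  The
 * state structure is hypothetical (the core treats lower_state_info as an
 * opaque pointer); RTNL must be held.
 */
struct example_lower_state {
	bool link_up;
};

static void example_report_lower_state(struct net_device *lower_dev, bool up)
{
	struct example_lower_state state = { .link_up = up };

	netdev_lower_state_changed(lower_dev, &state);
}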
8105
8106static void dev_change_rx_flags(struct net_device *dev, int flags)
8107{
8108        const struct net_device_ops *ops = dev->netdev_ops;
8109
8110        if (ops->ndo_change_rx_flags)
8111                ops->ndo_change_rx_flags(dev, flags);
8112}
8113
8114static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8115{
8116        unsigned int old_flags = dev->flags;
8117        kuid_t uid;
8118        kgid_t gid;
8119
8120        ASSERT_RTNL();
8121
8122        dev->flags |= IFF_PROMISC;
8123        dev->promiscuity += inc;
8124        if (dev->promiscuity == 0) {
8125                /*
8126                 * Avoid overflow.
8127                 * If inc causes overflow, leave promiscuity untouched and return an error.
8128                 */
8129                if (inc < 0)
8130                        dev->flags &= ~IFF_PROMISC;
8131                else {
8132                        dev->promiscuity -= inc;
8133                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
8134                                dev->name);
8135                        return -EOVERFLOW;
8136                }
8137        }
8138        if (dev->flags != old_flags) {
8139                pr_info("device %s %s promiscuous mode\n",
8140                        dev->name,
8141                        dev->flags & IFF_PROMISC ? "entered" : "left");
8142                if (audit_enabled) {
8143                        current_uid_gid(&uid, &gid);
8144                        audit_log(audit_context(), GFP_ATOMIC,
8145                                  AUDIT_ANOM_PROMISCUOUS,
8146                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8147                                  dev->name, (dev->flags & IFF_PROMISC),
8148                                  (old_flags & IFF_PROMISC),
8149                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
8150                                  from_kuid(&init_user_ns, uid),
8151                                  from_kgid(&init_user_ns, gid),
8152                                  audit_get_sessionid(current));
8153                }
8154
8155                dev_change_rx_flags(dev, IFF_PROMISC);
8156        }
8157        if (notify)
8158                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
8159        return 0;
8160}
8161
8162/**
8163 *      dev_set_promiscuity     - update promiscuity count on a device
8164 *      @dev: device
8165 *      @inc: modifier
8166 *
8167 *      Add or remove promiscuity from a device. While the count in the device
8168 *      remains above zero the interface remains promiscuous. Once it hits zero
8169 *      the device reverts to normal filtering operation. A negative inc
8170 *      value is used to drop promiscuity on the device.
8171 *      Return 0 if successful or a negative errno code on error.
8172 */
8173int dev_set_promiscuity(struct net_device *dev, int inc)
8174{
8175        unsigned int old_flags = dev->flags;
8176        int err;
8177
8178        err = __dev_set_promiscuity(dev, inc, true);
8179        if (err < 0)
8180                return err;
8181        if (dev->flags != old_flags)
8182                dev_set_rx_mode(dev);
8183        return err;
8184}
8185EXPORT_SYMBOL(dev_set_promiscuity);
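
/* Usage sketch (illustrative only): an in-kernel capture-style user bumping
 * and dropping a device's promiscuity count.  The example_* names are
 * hypothetical; both calls need the RTNL lock and must stay balanced, since
 * the interface remains promiscuous while the count is above zero.
 */
static int example_capture_start(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);
}

static void example_capture_stop(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);
}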
8186
8187static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8188{
8189        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8190
8191        ASSERT_RTNL();
8192
8193        dev->flags |= IFF_ALLMULTI;
8194        dev->allmulti += inc;
8195        if (dev->allmulti == 0) {
8196                /*
8197                 * Avoid overflow.
8198                 * If inc causes overflow, leave allmulti untouched and return an error.
8199                 */
8200                if (inc < 0)
8201                        dev->flags &= ~IFF_ALLMULTI;
8202                else {
8203                        dev->allmulti -= inc;
8204                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
8205                                dev->name);
8206                        return -EOVERFLOW;
8207                }
8208        }
8209        if (dev->flags ^ old_flags) {
8210                dev_change_rx_flags(dev, IFF_ALLMULTI);
8211                dev_set_rx_mode(dev);
8212                if (notify)
8213                        __dev_notify_flags(dev, old_flags,
8214                                           dev->gflags ^ old_gflags);
8215        }
8216        return 0;
8217}
8218
8219/**
8220 *      dev_set_allmulti        - update allmulti count on a device
8221 *      @dev: device
8222 *      @inc: modifier
8223 *
8224 *      Add or remove reception of all multicast frames to a device. While the
8225 *      count in the device remains above zero the interface remains listening
8226 *      to all multicast frames. Once it hits zero the device reverts to normal
8227 *      filtering operation. A negative @inc value is used to drop the counter
8228 *      when releasing a resource needing all multicasts.
8229 *      Return 0 if successful or a negative errno code on error.
8230 */
8231
8232int dev_set_allmulti(struct net_device *dev, int inc)
8233{
8234        return __dev_set_allmulti(dev, inc, true);
8235}
8236EXPORT_SYMBOL(dev_set_allmulti);
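
/* Usage sketch (illustrative only): a virtual upper device propagating its
 * allmulti requirement to the real device below it.  The example_* name is
 * hypothetical; the caller must hold RTNL and keep the +1/-1 increments
 * balanced over the device's lifetime.
 */
static int example_propagate_allmulti(struct net_device *real_dev, bool on)
{
	return dev_set_allmulti(real_dev, on ? 1 : -1);
}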
8237
8238/*
8239 *      Upload unicast and multicast address lists to device and
8240 *      configure RX filtering. When the device doesn't support unicast
8241 *      filtering it is put in promiscuous mode while unicast addresses
8242 *      are present.
8243 */
8244void __dev_set_rx_mode(struct net_device *dev)
8245{
8246        const struct net_device_ops *ops = dev->netdev_ops;
8247
8248        /* dev_open will call this function so the list will stay sane. */
8249        if (!(dev->flags&IFF_UP))
8250                return;
8251
8252        if (!netif_device_present(dev))
8253                return;
8254
8255        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8256                /* Unicast address changes may only happen under the rtnl,
8257                 * therefore calling __dev_set_promiscuity here is safe.
8258                 */
8259                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8260                        __dev_set_promiscuity(dev, 1, false);
8261                        dev->uc_promisc = true;
8262                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8263                        __dev_set_promiscuity(dev, -1, false);
8264                        dev->uc_promisc = false;
8265                }
8266        }
8267
8268        if (ops->ndo_set_rx_mode)
8269                ops->ndo_set_rx_mode(dev);
8270}
8271
8272void dev_set_rx_mode(struct net_device *dev)
8273{
8274        netif_addr_lock_bh(dev);
8275        __dev_set_rx_mode(dev);
8276        netif_addr_unlock_bh(dev);
8277}
8278
8279/**
8280 *      dev_get_flags - get flags reported to userspace
8281 *      @dev: device
8282 *
8283 *      Get the combination of flag bits exported through APIs to userspace.
8284 */
8285unsigned int dev_get_flags(const struct net_device *dev)
8286{
8287        unsigned int flags;
8288
8289        flags = (dev->flags & ~(IFF_PROMISC |
8290                                IFF_ALLMULTI |
8291                                IFF_RUNNING |
8292                                IFF_LOWER_UP |
8293                                IFF_DORMANT)) |
8294                (dev->gflags & (IFF_PROMISC |
8295                                IFF_ALLMULTI));
8296
8297        if (netif_running(dev)) {
8298                if (netif_oper_up(dev))
8299                        flags |= IFF_RUNNING;
8300                if (netif_carrier_ok(dev))
8301                        flags |= IFF_LOWER_UP;
8302                if (netif_dormant(dev))
8303                        flags |= IFF_DORMANT;
8304        }
8305
8306        return flags;
8307}
8308EXPORT_SYMBOL(dev_get_flags);
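
/* Usage sketch (illustrative only): reading the userspace view of a device's
 * flags, which folds the operational state (IFF_RUNNING, IFF_LOWER_UP,
 * IFF_DORMANT) and the gflags-tracked IFF_PROMISC/IFF_ALLMULTI into the
 * result.  The example_* name is hypothetical.
 */
static bool example_dev_is_usable(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	return (flags & IFF_UP) && (flags & IFF_RUNNING);
}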
8309
8310int __dev_change_flags(struct net_device *dev, unsigned int flags,
8311                       struct netlink_ext_ack *extack)
8312{
8313        unsigned int old_flags = dev->flags;
8314        int ret;
8315
8316        ASSERT_RTNL();
8317
8318        /*
8319         *      Set the flags on our device.
8320         */
8321
8322        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8323                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8324                               IFF_AUTOMEDIA)) |
8325                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8326                                    IFF_ALLMULTI));
8327
8328        /*
8329         *      Load in the correct multicast list now the flags have changed.
8330         */
8331
8332        if ((old_flags ^ flags) & IFF_MULTICAST)
8333                dev_change_rx_flags(dev, IFF_MULTICAST);
8334
8335        dev_set_rx_mode(dev);
8336
8337        /*
8338         *      Have we downed the interface? We handle IFF_UP ourselves
8339         *      according to user attempts to set it, rather than blindly
8340         *      setting it.
8341         */
8342
8343        ret = 0;
8344        if ((old_flags ^ flags) & IFF_UP) {
8345                if (old_flags & IFF_UP)
8346                        __dev_close(dev);
8347                else
8348                        ret = __dev_open(dev, extack);
8349        }
8350
8351        if ((flags ^ dev->gflags) & IFF_PROMISC) {
8352                int inc = (flags & IFF_PROMISC) ? 1 : -1;
8353                unsigned int old_flags = dev->flags;
8354
8355                dev->gflags ^= IFF_PROMISC;
8356
8357                if (__dev_set_promiscuity(dev, inc, false) >= 0)
8358                        if (dev->flags != old_flags)
8359                                dev_set_rx_mode(dev);
8360        }
8361
8362        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8363         * is important. Some (broken) drivers set IFF_PROMISC when
8364         * IFF_ALLMULTI is requested, without asking us and without reporting it.
8365         */
8366        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8367                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8368
8369                dev->gflags ^= IFF_ALLMULTI;
8370                __dev_set_allmulti(dev, inc, false);
8371        }
8372
8373        return ret;
8374}
8375
8376void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8377                        unsigned int gchanges)
8378{
8379        unsigned int changes = dev->flags ^ old_flags;
8380
8381        if (gchanges)
8382                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8383
8384        if (changes & IFF_UP) {
8385                if (dev->flags & IFF_UP)
8386                        call_netdevice_notifiers(NETDEV_UP, dev);
8387                else
8388                        call_netdevice_notifiers(NETDEV_DOWN, dev);
8389        }
8390
8391        if (dev->flags & IFF_UP &&
8392            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8393                struct netdev_notifier_change_info change_info = {
8394                        .info = {
8395                                .dev = dev,
8396                        },
8397                        .flags_changed = changes,
8398                };
8399
8400                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8401        }
8402}
8403
8404/**
8405 *      dev_change_flags - change device settings
8406 *      @dev: device
8407 *      @flags: device state flags
8408 *      @extack: netlink extended ack
8409 *
8410 *      Change settings on a device based on the supplied state flags. The flags
8411 *      are in the userspace exported format.
8412 */
8413int dev_change_flags(struct net_device *dev, unsigned int flags,
8414                     struct netlink_ext_ack *extack)
8415{
8416        int ret;
8417        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8418
8419        ret = __dev_change_flags(dev, flags, extack);
8420        if (ret < 0)
8421                return ret;
8422
8423        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8424        __dev_notify_flags(dev, old_flags, changes);
8425        return ret;
8426}
8427EXPORT_SYMBOL(dev_change_flags);
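
/* Usage sketch (illustrative only): bringing an interface administratively up
 * from kernel code with roughly the semantics of "ip link set dev X up".  The
 * example_* name is hypothetical; dev_change_flags() requires RTNL and emits
 * the usual RTM_NEWLINK/NETDEV_UP notifications on success.
 */
static int example_set_admin_up(struct net_device *dev,
				struct netlink_ext_ack *extack)
{
	unsigned int flags = dev_get_flags(dev);

	if (flags & IFF_UP)
		return 0;
	return dev_change_flags(dev, flags | IFF_UP, extack);
}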
8428
8429int __dev_set_mtu(struct net_device *dev, int new_mtu)
8430{
8431        const struct net_device_ops *ops = dev->netdev_ops;
8432
8433        if (ops->ndo_change_mtu)
8434                return ops->ndo_change_mtu(dev, new_mtu);
8435
8436        /* Pairs with all the lockless reads of dev->mtu in the stack */
8437        WRITE_ONCE(dev->mtu, new_mtu);
8438        return 0;
8439}
8440EXPORT_SYMBOL(__dev_set_mtu);
8441
8442int dev_validate_mtu(struct net_device *dev, int new_mtu,
8443                     struct netlink_ext_ack *extack)
8444{
8445        /* MTU must be positive, and in range */
8446        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8447                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8448                return -EINVAL;
8449        }
8450
8451        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8452                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8453                return -EINVAL;
8454        }
8455        return 0;
8456}
8457
8458/**
8459 *      dev_set_mtu_ext - Change maximum transfer unit
8460 *      @dev: device
8461 *      @new_mtu: new transfer unit
8462 *      @extack: netlink extended ack
8463 *
8464 *      Change the maximum transfer size of the network device.
8465 */
8466int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8467                    struct netlink_ext_ack *extack)
8468{
8469        int err, orig_mtu;
8470
8471        if (new_mtu == dev->mtu)
8472                return 0;
8473
8474        err = dev_validate_mtu(dev, new_mtu, extack);
8475        if (err)
8476                return err;
8477
8478        if (!netif_device_present(dev))
8479                return -ENODEV;
8480
8481        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8482        err = notifier_to_errno(err);
8483        if (err)
8484                return err;
8485
8486        orig_mtu = dev->mtu;
8487        err = __dev_set_mtu(dev, new_mtu);
8488
8489        if (!err) {
8490                err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8491                                                   orig_mtu);
8492                err = notifier_to_errno(err);
8493                if (err) {
8494                        /* setting mtu back and notifying everyone again,
8495                         * so that they have a chance to revert changes.
8496                         */
8497                        __dev_set_mtu(dev, orig_mtu);
8498                        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8499                                                     new_mtu);
8500                }
8501        }
8502        return err;
8503}
8504
8505int dev_set_mtu(struct net_device *dev, int new_mtu)
8506{
8507        struct netlink_ext_ack extack;
8508        int err;
8509
8510        memset(&extack, 0, sizeof(extack));
8511        err = dev_set_mtu_ext(dev, new_mtu, &extack);
8512        if (err && extack._msg)
8513                net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8514        return err;
8515}
8516EXPORT_SYMBOL(dev_set_mtu);
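
/* Usage sketch (illustrative only): an encapsulating device deriving its own
 * MTU from the lower device's MTU minus its header overhead.  The example_*
 * names and EXAMPLE_HDR_LEN are hypothetical; dev_set_mtu() validates the
 * value against min_mtu/max_mtu and fires NETDEV_CHANGEMTU, and the caller
 * must hold RTNL.
 */
#define EXAMPLE_HDR_LEN	8

static int example_set_tunnel_mtu(struct net_device *tunnel_dev,
				  struct net_device *lower_dev)
{
	return dev_set_mtu(tunnel_dev, lower_dev->mtu - EXAMPLE_HDR_LEN);
}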
8517
8518/**
8519 *      dev_change_tx_queue_len - Change TX queue length of a netdevice
8520 *      @dev: device
8521 *      @new_len: new tx queue length
8522 */
8523int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8524{
8525        unsigned int orig_len = dev->tx_queue_len;
8526        int res;
8527
8528        if (new_len != (unsigned int)new_len)
8529                return -ERANGE;
8530
8531        if (new_len != orig_len) {
8532                dev->tx_queue_len = new_len;
8533                res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8534                res = notifier_to_errno(res);
8535                if (res)
8536                        goto err_rollback;
8537                res = dev_qdisc_change_tx_queue_len(dev);
8538                if (res)
8539                        goto err_rollback;
8540        }
8541
8542        return 0;
8543
8544err_rollback:
8545        netdev_err(dev, "refused to change device tx_queue_len\n");
8546        dev->tx_queue_len = orig_len;
8547        return res;
8548}
8549
8550/**
8551 *      dev_set_group - Change group this device belongs to
8552 *      @dev: device
8553 *      @new_group: group this device should belong to
8554 */
8555void dev_set_group(struct net_device *dev, int new_group)
8556{
8557        dev->group = new_group;
8558}
8559EXPORT_SYMBOL(dev_set_group);
8560
8561/**
8562 *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8563 *      @dev: device
8564 *      @addr: new address
8565 *      @extack: netlink extended ack
8566 */
8567int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8568                              struct netlink_ext_ack *extack)
8569{
8570        struct netdev_notifier_pre_changeaddr_info info = {
8571                .info.dev = dev,
8572                .info.extack = extack,
8573                .dev_addr = addr,
8574        };
8575        int rc;
8576
8577        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8578        return notifier_to_errno(rc);
8579}
8580EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8581
8582/**
8583 *      dev_set_mac_address - Change Media Access Control Address
8584 *      @dev: device
8585 *      @sa: new address
8586 *      @extack: netlink extended ack
8587 *
8588 *      Change the hardware (MAC) address of the device
8589 */
8590int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8591                        struct netlink_ext_ack *extack)
8592{
8593        const struct net_device_ops *ops = dev->netdev_ops;
8594        int err;
8595
8596        if (!ops->ndo_set_mac_address)
8597                return -EOPNOTSUPP;
8598        if (sa->sa_family != dev->type)
8599                return -EINVAL;
8600        if (!netif_device_present(dev))
8601                return -ENODEV;
8602        err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8603        if (err)
8604                return err;
8605        err = ops->ndo_set_mac_address(dev, sa);
8606        if (err)
8607                return err;
8608        dev->addr_assign_type = NET_ADDR_SET;
8609        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8610        add_device_randomness(dev->dev_addr, dev->addr_len);
8611        return 0;
8612}
8613EXPORT_SYMBOL(dev_set_mac_address);
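
/* Usage sketch (illustrative only): programming an Ethernet MAC address from
 * kernel code.  The example_* name is hypothetical; sa_family must match
 * dev->type (ARPHRD_ETHER here) and the caller must hold RTNL.
 */
static int example_set_ether_addr(struct net_device *dev, const u8 *addr,
				  struct netlink_ext_ack *extack)
{
	struct sockaddr sa;

	if (dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, ETH_ALEN);
	return dev_set_mac_address(dev, &sa, extack);
}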
8614
8615/**
8616 *      dev_change_carrier - Change device carrier
8617 *      @dev: device
8618 *      @new_carrier: new value
8619 *
8620 *      Change device carrier
8621 */
8622int dev_change_carrier(struct net_device *dev, bool new_carrier)
8623{
8624        const struct net_device_ops *ops = dev->netdev_ops;
8625
8626        if (!ops->ndo_change_carrier)
8627                return -EOPNOTSUPP;
8628        if (!netif_device_present(dev))
8629                return -ENODEV;
8630        return ops->ndo_change_carrier(dev, new_carrier);
8631}
8632EXPORT_SYMBOL(dev_change_carrier);
8633
8634/**
8635 *      dev_get_phys_port_id - Get device physical port ID
8636 *      @dev: device
8637 *      @ppid: port ID
8638 *
8639 *      Get device physical port ID
8640 */
8641int dev_get_phys_port_id(struct net_device *dev,
8642                         struct netdev_phys_item_id *ppid)
8643{
8644        const struct net_device_ops *ops = dev->netdev_ops;
8645
8646        if (!ops->ndo_get_phys_port_id)
8647                return -EOPNOTSUPP;
8648        return ops->ndo_get_phys_port_id(dev, ppid);
8649}
8650EXPORT_SYMBOL(dev_get_phys_port_id);
8651
8652/**
8653 *      dev_get_phys_port_name - Get device physical port name
8654 *      @dev: device
8655 *      @name: port name
8656 *      @len: limit of bytes to copy to name
8657 *
8658 *      Get device physical port name
8659 */
8660int dev_get_phys_port_name(struct net_device *dev,
8661                           char *name, size_t len)
8662{
8663        const struct net_device_ops *ops = dev->netdev_ops;
8664        int err;
8665
8666        if (ops->ndo_get_phys_port_name) {
8667                err = ops->ndo_get_phys_port_name(dev, name, len);
8668                if (err != -EOPNOTSUPP)
8669                        return err;
8670        }
8671        return devlink_compat_phys_port_name_get(dev, name, len);
8672}
8673EXPORT_SYMBOL(dev_get_phys_port_name);
8674
8675/**
8676 *      dev_get_port_parent_id - Get the device's port parent identifier
8677 *      @dev: network device
8678 *      @ppid: pointer to a storage for the port's parent identifier
8679 *      @recurse: allow/disallow recursion to lower devices
8680 *
8681 *      Get the device's port parent identifier
8682 */
8683int dev_get_port_parent_id(struct net_device *dev,
8684                           struct netdev_phys_item_id *ppid,
8685                           bool recurse)
8686{
8687        const struct net_device_ops *ops = dev->netdev_ops;
8688        struct netdev_phys_item_id first = { };
8689        struct net_device *lower_dev;
8690        struct list_head *iter;
8691        int err;
8692
8693        if (ops->ndo_get_port_parent_id) {
8694                err = ops->ndo_get_port_parent_id(dev, ppid);
8695                if (err != -EOPNOTSUPP)
8696                        return err;
8697        }
8698
8699        err = devlink_compat_switch_id_get(dev, ppid);
8700        if (!err || err != -EOPNOTSUPP)
8701                return err;
8702
8703        if (!recurse)
8704                return -EOPNOTSUPP;
8705
8706        netdev_for_each_lower_dev(dev, lower_dev, iter) {
8707                err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8708                if (err)
8709                        break;
8710                if (!first.id_len)
8711                        first = *ppid;
8712                else if (memcmp(&first, ppid, sizeof(*ppid)))
8713                        return -EOPNOTSUPP;
8714        }
8715
8716        return err;
8717}
8718EXPORT_SYMBOL(dev_get_port_parent_id);
8719
8720/**
8721 *      netdev_port_same_parent_id - Indicate if two network devices have
8722 *      the same port parent identifier
8723 *      @a: first network device
8724 *      @b: second network device
8725 */
8726bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8727{
8728        struct netdev_phys_item_id a_id = { };
8729        struct netdev_phys_item_id b_id = { };
8730
8731        if (dev_get_port_parent_id(a, &a_id, true) ||
8732            dev_get_port_parent_id(b, &b_id, true))
8733                return false;
8734
8735        return netdev_phys_item_id_same(&a_id, &b_id);
8736}
8737EXPORT_SYMBOL(netdev_port_same_parent_id);
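
/* Usage sketch (illustrative only): a LAG-style driver checking whether a
 * candidate port sits on the same switch ASIC as every port it already has,
 * which is a common prerequisite for offloading the aggregate.  The
 * example_* name is hypothetical; RTNL protects the lower-device walk.
 */
static bool example_all_ports_same_switch(struct net_device *master,
					  struct net_device *new_port)
{
	struct net_device *lower;
	struct list_head *iter;

	netdev_for_each_lower_dev(master, lower, iter) {
		if (!netdev_port_same_parent_id(lower, new_port))
			return false;
	}
	return true;
}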
8738
8739/**
8740 *      dev_change_proto_down - update protocol port state information
8741 *      @dev: device
8742 *      @proto_down: new value
8743 *
8744 *      This info can be used by switch drivers to set the phys state of the
8745 *      port.
8746 */
8747int dev_change_proto_down(struct net_device *dev, bool proto_down)
8748{
8749        const struct net_device_ops *ops = dev->netdev_ops;
8750
8751        if (!ops->ndo_change_proto_down)
8752                return -EOPNOTSUPP;
8753        if (!netif_device_present(dev))
8754                return -ENODEV;
8755        return ops->ndo_change_proto_down(dev, proto_down);
8756}
8757EXPORT_SYMBOL(dev_change_proto_down);
8758
8759/**
8760 *      dev_change_proto_down_generic - generic implementation for
8761 *      ndo_change_proto_down that sets carrier according to
8762 *      proto_down.
8763 *
8764 *      @dev: device
8765 *      @proto_down: new value
8766 */
8767int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8768{
8769        if (proto_down)
8770                netif_carrier_off(dev);
8771        else
8772                netif_carrier_on(dev);
8773        dev->proto_down = proto_down;
8774        return 0;
8775}
8776EXPORT_SYMBOL(dev_change_proto_down_generic);
8777
8778/**
8779 *      dev_change_proto_down_reason - update proto down reason bits
8780 *
8781 *      @dev: device
8782 *      @mask: proto down mask
8783 *      @value: proto down value
8784 */
8785void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8786                                  u32 value)
8787{
8788        int b;
8789
8790        if (!mask) {
8791                dev->proto_down_reason = value;
8792        } else {
8793                for_each_set_bit(b, &mask, 32) {
8794                        if (value & (1 << b))
8795                                dev->proto_down_reason |= BIT(b);
8796                        else
8797                                dev->proto_down_reason &= ~BIT(b);
8798                }
8799        }
8800}
8801EXPORT_SYMBOL(dev_change_proto_down_reason);
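
/* Usage sketch (illustrative only): setting or clearing a single proto_down
 * reason bit.  The bit number is hypothetical; with a non-zero mask only the
 * masked bits of dev->proto_down_reason are updated, as implemented above,
 * and the caller holds RTNL.
 */
#define EXAMPLE_PROTO_DOWN_REASON_BIT	2

static void example_mark_proto_down_reason(struct net_device *dev, bool set)
{
	unsigned long mask = BIT(EXAMPLE_PROTO_DOWN_REASON_BIT);

	dev_change_proto_down_reason(dev, mask,
				     set ? BIT(EXAMPLE_PROTO_DOWN_REASON_BIT) : 0);
}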
8802
8803struct bpf_xdp_link {
8804        struct bpf_link link;
8805        struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
8806        int flags;
8807};
8808
8809static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
8810{
8811        if (flags & XDP_FLAGS_HW_MODE)
8812                return XDP_MODE_HW;
8813        if (flags & XDP_FLAGS_DRV_MODE)
8814                return XDP_MODE_DRV;
8815        if (flags & XDP_FLAGS_SKB_MODE)
8816                return XDP_MODE_SKB;
8817        return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
8818}
8819
8820static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
8821{
8822        switch (mode) {
8823        case XDP_MODE_SKB:
8824                return generic_xdp_install;
8825        case XDP_MODE_DRV:
8826        case XDP_MODE_HW:
8827                return dev->netdev_ops->ndo_bpf;
8828        default:
8829                return NULL;
8830        }
8831}
8832
8833static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
8834                                         enum bpf_xdp_mode mode)
8835{
8836        return dev->xdp_state[mode].link;
8837}
8838
8839static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
8840                                     enum bpf_xdp_mode mode)
8841{
8842        struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
8843
8844        if (link)
8845                return link->link.prog;
8846        return dev->xdp_state[mode].prog;
8847}
8848
8849u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
8850{
8851        struct bpf_prog *prog = dev_xdp_prog(dev, mode);
8852
8853        return prog ? prog->aux->id : 0;
8854}
8855
8856static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
8857                             struct bpf_xdp_link *link)
8858{
8859        dev->xdp_state[mode].link = link;
8860        dev->xdp_state[mode].prog = NULL;
8861}
8862
8863static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
8864                             struct bpf_prog *prog)
8865{
8866        dev->xdp_state[mode].link = NULL;
8867        dev->xdp_state[mode].prog = prog;
8868}
8869
8870static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
8871                           bpf_op_t bpf_op, struct netlink_ext_ack *extack,
8872                           u32 flags, struct bpf_prog *prog)
8873{
8874        struct netdev_bpf xdp;
8875        int err;
8876
8877        memset(&xdp, 0, sizeof(xdp));
8878        xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
8879        xdp.extack = extack;
8880        xdp.flags = flags;
8881        xdp.prog = prog;
8882
8883        /* Drivers assume refcnt is already incremented (i.e., prog pointer is
8884         * "moved" into driver), so they don't increment it on their own, but
8885         * they do decrement refcnt when program is detached or replaced.
8886         * Given net_device also owns link/prog, we need to bump refcnt here
8887         * to prevent drivers from underflowing it.
8888         */
8889        if (prog)
8890                bpf_prog_inc(prog);
8891        err = bpf_op(dev, &xdp);
8892        if (err) {
8893                if (prog)
8894                        bpf_prog_put(prog);
8895                return err;
8896        }
8897
8898        if (mode != XDP_MODE_HW)
8899                bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
8900
8901        return 0;
8902}
8903
8904static void dev_xdp_uninstall(struct net_device *dev)
8905{
8906        struct bpf_xdp_link *link;
8907        struct bpf_prog *prog;
8908        enum bpf_xdp_mode mode;
8909        bpf_op_t bpf_op;
8910
8911        ASSERT_RTNL();
8912
8913        for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
8914                prog = dev_xdp_prog(dev, mode);
8915                if (!prog)
8916                        continue;
8917
8918                bpf_op = dev_xdp_bpf_op(dev, mode);
8919                if (!bpf_op)
8920                        continue;
8921
8922                WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
8923
8924                /* auto-detach link from net device */
8925                link = dev_xdp_link(dev, mode);
8926                if (link)
8927                        link->dev = NULL;
8928                else
8929                        bpf_prog_put(prog);
8930
8931                dev_xdp_set_link(dev, mode, NULL);
8932        }
8933}
8934
8935static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
8936                          struct bpf_xdp_link *link, struct bpf_prog *new_prog,
8937                          struct bpf_prog *old_prog, u32 flags)
8938{
8939        struct bpf_prog *cur_prog;
8940        enum bpf_xdp_mode mode;
8941        bpf_op_t bpf_op;
8942        int err;
8943
8944        ASSERT_RTNL();
8945
8946        /* either link or prog attachment, never both */
8947        if (link && (new_prog || old_prog))
8948                return -EINVAL;
8949        /* link supports only XDP mode flags */
8950        if (link && (flags & ~XDP_FLAGS_MODES)) {
8951                NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
8952                return -EINVAL;
8953        }
8954        /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
8955        if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
8956                NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
8957                return -EINVAL;
8958        }
8959        /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
8960        if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
8961                NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
8962                return -EINVAL;
8963        }
8964
8965        mode = dev_xdp_mode(dev, flags);
8966        /* can't replace attached link */
8967        if (dev_xdp_link(dev, mode)) {
8968                NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
8969                return -EBUSY;
8970        }
8971
8972        cur_prog = dev_xdp_prog(dev, mode);
8973        /* can't replace attached prog with link */
8974        if (link && cur_prog) {
8975                NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
8976                return -EBUSY;
8977        }
8978        if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
8979                NL_SET_ERR_MSG(extack, "Active program does not match expected");
8980                return -EEXIST;
8981        }
8982
8983        /* put effective new program into new_prog */
8984        if (link)
8985                new_prog = link->link.prog;
8986
8987        if (new_prog) {
8988                bool offload = mode == XDP_MODE_HW;
8989                enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
8990                                               ? XDP_MODE_DRV : XDP_MODE_SKB;
8991
8992                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
8993                        NL_SET_ERR_MSG(extack, "XDP program already attached");
8994                        return -EBUSY;
8995                }
8996                if (!offload && dev_xdp_prog(dev, other_mode)) {
8997                        NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
8998                        return -EEXIST;
8999                }
9000                if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9001                        NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9002                        return -EINVAL;
9003                }
9004                if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9005                        NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9006                        return -EINVAL;
9007                }
9008                if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9009                        NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9010                        return -EINVAL;
9011                }
9012        }
9013
9014        /* don't call drivers if the effective program didn't change */
9015        if (new_prog != cur_prog) {
9016                bpf_op = dev_xdp_bpf_op(dev, mode);
9017                if (!bpf_op) {
9018                        NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9019                        return -EOPNOTSUPP;
9020                }
9021
9022                err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9023                if (err)
9024                        return err;
9025        }
9026
9027        if (link)
9028                dev_xdp_set_link(dev, mode, link);
9029        else
9030                dev_xdp_set_prog(dev, mode, new_prog);
9031        if (cur_prog)
9032                bpf_prog_put(cur_prog);
9033
9034        return 0;
9035}
9036
9037static int dev_xdp_attach_link(struct net_device *dev,
9038                               struct netlink_ext_ack *extack,
9039                               struct bpf_xdp_link *link)
9040{
9041        return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9042}
9043
9044static int dev_xdp_detach_link(struct net_device *dev,
9045                               struct netlink_ext_ack *extack,
9046                               struct bpf_xdp_link *link)
9047{
9048        enum bpf_xdp_mode mode;
9049        bpf_op_t bpf_op;
9050
9051        ASSERT_RTNL();
9052
9053        mode = dev_xdp_mode(dev, link->flags);
9054        if (dev_xdp_link(dev, mode) != link)
9055                return -EINVAL;
9056
9057        bpf_op = dev_xdp_bpf_op(dev, mode);
9058        WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9059        dev_xdp_set_link(dev, mode, NULL);
9060        return 0;
9061}
9062
9063static void bpf_xdp_link_release(struct bpf_link *link)
9064{
9065        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9066
9067        rtnl_lock();
9068
9069        /* if racing with net_device's tear down, xdp_link->dev might be
9070         * already NULL, in which case link was already auto-detached
9071         */
9072        if (xdp_link->dev) {
9073                WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9074                xdp_link->dev = NULL;
9075        }
9076
9077        rtnl_unlock();
9078}
9079
9080static int bpf_xdp_link_detach(struct bpf_link *link)
9081{
9082        bpf_xdp_link_release(link);
9083        return 0;
9084}
9085
9086static void bpf_xdp_link_dealloc(struct bpf_link *link)
9087{
9088        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9089
9090        kfree(xdp_link);
9091}
9092
9093static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9094                                     struct seq_file *seq)
9095{
9096        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9097        u32 ifindex = 0;
9098
9099        rtnl_lock();
9100        if (xdp_link->dev)
9101                ifindex = xdp_link->dev->ifindex;
9102        rtnl_unlock();
9103
9104        seq_printf(seq, "ifindex:\t%u\n", ifindex);
9105}
9106
9107static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9108                                       struct bpf_link_info *info)
9109{
9110        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9111        u32 ifindex = 0;
9112
9113        rtnl_lock();
9114        if (xdp_link->dev)
9115                ifindex = xdp_link->dev->ifindex;
9116        rtnl_unlock();
9117
9118        info->xdp.ifindex = ifindex;
9119        return 0;
9120}
9121
9122static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9123                               struct bpf_prog *old_prog)
9124{
9125        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9126        enum bpf_xdp_mode mode;
9127        bpf_op_t bpf_op;
9128        int err = 0;
9129
9130        rtnl_lock();
9131
9132        /* link might have been auto-released already, so fail */
9133        if (!xdp_link->dev) {
9134                err = -ENOLINK;
9135                goto out_unlock;
9136        }
9137
9138        if (old_prog && link->prog != old_prog) {
9139                err = -EPERM;
9140                goto out_unlock;
9141        }
9142        old_prog = link->prog;
9143        if (old_prog == new_prog) {
9144                /* no-op, don't disturb drivers */
9145                bpf_prog_put(new_prog);
9146                goto out_unlock;
9147        }
9148
9149        mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9150        bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9151        err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9152                              xdp_link->flags, new_prog);
9153        if (err)
9154                goto out_unlock;
9155
9156        old_prog = xchg(&link->prog, new_prog);
9157        bpf_prog_put(old_prog);
9158
9159out_unlock:
9160        rtnl_unlock();
9161        return err;
9162}
9163
9164static const struct bpf_link_ops bpf_xdp_link_lops = {
9165        .release = bpf_xdp_link_release,
9166        .dealloc = bpf_xdp_link_dealloc,
9167        .detach = bpf_xdp_link_detach,
9168        .show_fdinfo = bpf_xdp_link_show_fdinfo,
9169        .fill_link_info = bpf_xdp_link_fill_link_info,
9170        .update_prog = bpf_xdp_link_update,
9171};
9172
9173int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9174{
9175        struct net *net = current->nsproxy->net_ns;
9176        struct bpf_link_primer link_primer;
9177        struct bpf_xdp_link *link;
9178        struct net_device *dev;
9179        int err, fd;
9180
9181        dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9182        if (!dev)
9183                return -EINVAL;
9184
9185        link = kzalloc(sizeof(*link), GFP_USER);
9186        if (!link) {
9187                err = -ENOMEM;
9188                goto out_put_dev;
9189        }
9190
9191        bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9192        link->dev = dev;
9193        link->flags = attr->link_create.flags;
9194
9195        err = bpf_link_prime(&link->link, &link_primer);
9196        if (err) {
9197                kfree(link);
9198                goto out_put_dev;
9199        }
9200
9201        rtnl_lock();
9202        err = dev_xdp_attach_link(dev, NULL, link);
9203        rtnl_unlock();
9204
9205        if (err) {
9206                bpf_link_cleanup(&link_primer);
9207                goto out_put_dev;
9208        }
9209
9210        fd = bpf_link_settle(&link_primer);
9211        /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9212        dev_put(dev);
9213        return fd;
9214
9215out_put_dev:
9216        dev_put(dev);
9217        return err;
9218}
9219
9220/**
9221 *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
9222 *      @dev: device
9223 *      @extack: netlink extended ack
9224 *      @fd: new program fd or negative value to clear
9225 *      @expected_fd: old program fd that userspace expects to replace or clear
9226 *      @flags: xdp-related flags
9227 *
9228 *      Set or clear a bpf program for a device
9229 */
9230int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9231                      int fd, int expected_fd, u32 flags)
9232{
9233        enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9234        struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9235        int err;
9236
9237        ASSERT_RTNL();
9238
9239        if (fd >= 0) {
9240                new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9241                                                 mode != XDP_MODE_SKB);
9242                if (IS_ERR(new_prog))
9243                        return PTR_ERR(new_prog);
9244        }
9245
9246        if (expected_fd >= 0) {
9247                old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9248                                                 mode != XDP_MODE_SKB);
9249                if (IS_ERR(old_prog)) {
9250                        err = PTR_ERR(old_prog);
9251                        old_prog = NULL;
9252                        goto err_out;
9253                }
9254        }
9255
9256        err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9257
9258err_out:
9259        if (err && new_prog)
9260                bpf_prog_put(new_prog);
9261        if (old_prog)
9262                bpf_prog_put(old_prog);
9263        return err;
9264}
9265
9266/**
9267 *      dev_new_index   -       allocate an ifindex
9268 *      @net: the applicable net namespace
9269 *
9270 *      Returns a suitable unique value for a new device interface
9271 *      number.  The caller must hold the rtnl semaphore or the
9272 *      dev_base_lock to be sure it remains unique.
9273 */
9274static int dev_new_index(struct net *net)
9275{
9276        int ifindex = net->ifindex;
9277
9278        for (;;) {
9279                if (++ifindex <= 0)
9280                        ifindex = 1;
9281                if (!__dev_get_by_index(net, ifindex))
9282                        return net->ifindex = ifindex;
9283        }
9284}
9285
9286/* Delayed registration/unregistration */
9287static LIST_HEAD(net_todo_list);
9288DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9289
9290static void net_set_todo(struct net_device *dev)
9291{
9292        list_add_tail(&dev->todo_list, &net_todo_list);
9293        dev_net(dev)->dev_unreg_count++;
9294}
9295
9296static void rollback_registered_many(struct list_head *head)
9297{
9298        struct net_device *dev, *tmp;
9299        LIST_HEAD(close_head);
9300
9301        BUG_ON(dev_boot_phase);
9302        ASSERT_RTNL();
9303
9304        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9305                /* Some devices call this without ever having registered,
9306                 * as part of their initialization unwind. Remove those
9307                 * devices and proceed with the remaining.
9308                 */
9309                if (dev->reg_state == NETREG_UNINITIALIZED) {
9310                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
9311                                 dev->name, dev);
9312
9313                        WARN_ON(1);
9314                        list_del(&dev->unreg_list);
9315                        continue;
9316                }
9317                dev->dismantle = true;
9318                BUG_ON(dev->reg_state != NETREG_REGISTERED);
9319        }
9320
9321        /* If device is running, close it first. */
9322        list_for_each_entry(dev, head, unreg_list)
9323                list_add_tail(&dev->close_list, &close_head);
9324        dev_close_many(&close_head, true);
9325
9326        list_for_each_entry(dev, head, unreg_list) {
9327                /* And unlink it from device chain. */
9328                unlist_netdevice(dev);
9329
9330                dev->reg_state = NETREG_UNREGISTERING;
9331        }
9332        flush_all_backlogs();
9333
9334        synchronize_net();
9335
9336        list_for_each_entry(dev, head, unreg_list) {
9337                struct sk_buff *skb = NULL;
9338
9339                /* Shutdown queueing discipline. */
9340                dev_shutdown(dev);
9341
9342                dev_xdp_uninstall(dev);
9343
9344                /* Notify protocols that we are about to destroy
9345                 * this device. They should clean up all their state.
9346                 */
9347                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9348
9349                if (!dev->rtnl_link_ops ||
9350                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9351                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
9352                                                     GFP_KERNEL, NULL, 0);
9353
9354                /*
9355                 *      Flush the unicast and multicast chains
9356                 */
9357                dev_uc_flush(dev);
9358                dev_mc_flush(dev);
9359
9360                netdev_name_node_alt_flush(dev);
9361                netdev_name_node_free(dev->name_node);
9362
9363                if (dev->netdev_ops->ndo_uninit)
9364                        dev->netdev_ops->ndo_uninit(dev);
9365
9366                if (skb)
9367                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
9368
9369                /* Notifier chain MUST detach all upper devices from us. */
9370                WARN_ON(netdev_has_any_upper_dev(dev));
9371                WARN_ON(netdev_has_any_lower_dev(dev));
9372
9373                /* Remove entries from kobject tree */
9374                netdev_unregister_kobject(dev);
9375#ifdef CONFIG_XPS
9376                /* Remove XPS queueing entries */
9377                netif_reset_xps_queues_gt(dev, 0);
9378#endif
9379        }
9380
9381        synchronize_net();
9382
9383        list_for_each_entry(dev, head, unreg_list)
9384                dev_put(dev);
9385}
9386
9387static void rollback_registered(struct net_device *dev)
9388{
9389        LIST_HEAD(single);
9390
9391        list_add(&dev->unreg_list, &single);
9392        rollback_registered_many(&single);
9393        list_del(&single);
9394}
9395
9396static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9397        struct net_device *upper, netdev_features_t features)
9398{
9399        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9400        netdev_features_t feature;
9401        int feature_bit;
9402
9403        for_each_netdev_feature(upper_disables, feature_bit) {
9404                feature = __NETIF_F_BIT(feature_bit);
9405                if (!(upper->wanted_features & feature)
9406                    && (features & feature)) {
9407                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9408                                   &feature, upper->name);
9409                        features &= ~feature;
9410                }
9411        }
9412
9413        return features;
9414}
9415
9416static void netdev_sync_lower_features(struct net_device *upper,
9417        struct net_device *lower, netdev_features_t features)
9418{
9419        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9420        netdev_features_t feature;
9421        int feature_bit;
9422
9423        for_each_netdev_feature(upper_disables, feature_bit) {
9424                feature = __NETIF_F_BIT(feature_bit);
9425                if (!(features & feature) && (lower->features & feature)) {
9426                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9427                                   &feature, lower->name);
9428                        lower->wanted_features &= ~feature;
9429                        __netdev_update_features(lower);
9430
9431                        if (unlikely(lower->features & feature))
9432                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9433                                            &feature, lower->name);
9434                        else
9435                                netdev_features_change(lower);
9436                }
9437        }
9438}
9439
9440static netdev_features_t netdev_fix_features(struct net_device *dev,
9441        netdev_features_t features)
9442{
9443        /* Fix illegal checksum combinations */
9444        if ((features & NETIF_F_HW_CSUM) &&
9445            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9446                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9447                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9448        }
9449
9450        /* TSO requires that SG is present as well. */
9451        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9452                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9453                features &= ~NETIF_F_ALL_TSO;
9454        }
9455
9456        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9457                                        !(features & NETIF_F_IP_CSUM)) {
9458                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9459                features &= ~NETIF_F_TSO;
9460                features &= ~NETIF_F_TSO_ECN;
9461        }
9462
9463        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9464                                         !(features & NETIF_F_IPV6_CSUM)) {
9465                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9466                features &= ~NETIF_F_TSO6;
9467        }
9468
9469        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9470        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9471                features &= ~NETIF_F_TSO_MANGLEID;
9472
9473        /* TSO ECN requires that TSO is present as well. */
9474        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9475                features &= ~NETIF_F_TSO_ECN;
9476
9477        /* Software GSO depends on SG. */
9478        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9479                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9480                features &= ~NETIF_F_GSO;
9481        }
9482
9483        /* GSO partial features require GSO partial be set */
9484        if ((features & dev->gso_partial_features) &&
9485            !(features & NETIF_F_GSO_PARTIAL)) {
9486                netdev_dbg(dev,
9487                           "Dropping partially supported GSO features since no GSO partial.\n");
9488                features &= ~dev->gso_partial_features;
9489        }
9490
9491        if (!(features & NETIF_F_RXCSUM)) {
9492                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9493                 * successfully merged by hardware must also have the
9494                 * checksum verified by hardware.  If the user does not
9495                 * want to enable RXCSUM, logically, we should disable GRO_HW.
9496                 */
9497                if (features & NETIF_F_GRO_HW) {
9498                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9499                        features &= ~NETIF_F_GRO_HW;
9500                }
9501        }
9502
9503        /* LRO/HW-GRO features cannot be combined with RX-FCS */
9504        if (features & NETIF_F_RXFCS) {
9505                if (features & NETIF_F_LRO) {
9506                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9507                        features &= ~NETIF_F_LRO;
9508                }
9509
9510                if (features & NETIF_F_GRO_HW) {
9511                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9512                        features &= ~NETIF_F_GRO_HW;
9513                }
9514        }
9515
9516        return features;
9517}
9518
9519int __netdev_update_features(struct net_device *dev)
9520{
9521        struct net_device *upper, *lower;
9522        netdev_features_t features;
9523        struct list_head *iter;
9524        int err = -1;
9525
9526        ASSERT_RTNL();
9527
9528        features = netdev_get_wanted_features(dev);
9529
9530        if (dev->netdev_ops->ndo_fix_features)
9531                features = dev->netdev_ops->ndo_fix_features(dev, features);
9532
9533        /* driver might be less strict about feature dependencies */
9534        features = netdev_fix_features(dev, features);
9535
9536        /* some features can't be enabled if they're off on an upper device */
9537        netdev_for_each_upper_dev_rcu(dev, upper, iter)
9538                features = netdev_sync_upper_features(dev, upper, features);
9539
9540        if (dev->features == features)
9541                goto sync_lower;
9542
9543        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9544                &dev->features, &features);
9545
9546        if (dev->netdev_ops->ndo_set_features)
9547                err = dev->netdev_ops->ndo_set_features(dev, features);
9548        else
9549                err = 0;
9550
9551        if (unlikely(err < 0)) {
9552                netdev_err(dev,
9553                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
9554                        err, &features, &dev->features);
9555                /* return non-0 since some features might have changed and
9556                 * it's better to fire a spurious notification than miss it
9557                 */
9558                return -1;
9559        }
9560
9561sync_lower:
9562        /* some features must be disabled on lower devices when disabled
9563         * on an upper device (think: bonding master or bridge)
9564         */
9565        netdev_for_each_lower_dev(dev, lower, iter)
9566                netdev_sync_lower_features(dev, lower, features);
9567
9568        if (!err) {
9569                netdev_features_t diff = features ^ dev->features;
9570
9571                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9572                        /* udp_tunnel_{get,drop}_rx_info both need
9573                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9574                         * device, or they won't do anything.
9575                         * Thus we need to update dev->features
9576                         * *before* calling udp_tunnel_get_rx_info,
9577                         * but *after* calling udp_tunnel_drop_rx_info.
9578                         */
9579                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9580                                dev->features = features;
9581                                udp_tunnel_get_rx_info(dev);
9582                        } else {
9583                                udp_tunnel_drop_rx_info(dev);
9584                        }
9585                }
9586
9587                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9588                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9589                                dev->features = features;
9590                                err |= vlan_get_rx_ctag_filter_info(dev);
9591                        } else {
9592                                vlan_drop_rx_ctag_filter_info(dev);
9593                        }
9594                }
9595
9596                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9597                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9598                                dev->features = features;
9599                                err |= vlan_get_rx_stag_filter_info(dev);
9600                        } else {
9601                                vlan_drop_rx_stag_filter_info(dev);
9602                        }
9603                }
9604
9605                dev->features = features;
9606        }
9607
9608        return err < 0 ? 0 : 1;
9609}
9610
9611/**
9612 *      netdev_update_features - recalculate device features
9613 *      @dev: the device to check
9614 *
9615 *      Recalculate dev->features set and send notifications if it
9616 *      has changed. Should be called after driver or hardware dependent
9617 *      conditions might have changed that influence the features.
9618 */
9619void netdev_update_features(struct net_device *dev)
9620{
9621        if (__netdev_update_features(dev))
9622                netdev_features_change(dev);
9623}
9624EXPORT_SYMBOL(netdev_update_features);
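
/*
 * Example (illustrative sketch, not part of dev.c): a driver that
 * discovers at runtime that an offload can no longer be supported
 * might clear it from hw_features and ask the core to recompute
 * dev->features.  The function and the trigger are hypothetical;
 * the caller must hold RTNL.
 */
static void example_firmware_lost_tso(struct net_device *dev)
{
        ASSERT_RTNL();

        dev->hw_features &= ~NETIF_F_TSO;
        /* reruns ndo_fix_features()/netdev_fix_features() and notifies */
        netdev_update_features(dev);
}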
9625
9626/**
9627 *      netdev_change_features - recalculate device features
9628 *      @dev: the device to check
9629 *
9630 *      Recalculate dev->features set and send notifications even
9631 *      if they have not changed. Should be called instead of
9632 *      netdev_update_features() if also dev->vlan_features might
9633 *      have changed to allow the changes to be propagated to stacked
9634 *      VLAN devices.
9635 */
9636void netdev_change_features(struct net_device *dev)
9637{
9638        __netdev_update_features(dev);
9639        netdev_features_change(dev);
9640}
9641EXPORT_SYMBOL(netdev_change_features);
9642
9643/**
9644 *      netif_stacked_transfer_operstate -      transfer operstate
9645 *      @rootdev: the root or lower level device to transfer state from
9646 *      @dev: the device to transfer operstate to
9647 *
9648 *      Transfer operational state from root to device. This is normally
9649 *      called when a stacking relationship exists between the root
9650 *      device and the device (a leaf device).
9651 */
9652void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9653                                        struct net_device *dev)
9654{
9655        if (rootdev->operstate == IF_OPER_DORMANT)
9656                netif_dormant_on(dev);
9657        else
9658                netif_dormant_off(dev);
9659
9660        if (rootdev->operstate == IF_OPER_TESTING)
9661                netif_testing_on(dev);
9662        else
9663                netif_testing_off(dev);
9664
9665        if (netif_carrier_ok(rootdev))
9666                netif_carrier_on(dev);
9667        else
9668                netif_carrier_off(dev);
9669}
9670EXPORT_SYMBOL(netif_stacked_transfer_operstate);
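
/*
 * Example (illustrative sketch, not part of dev.c): a stacking driver
 * (VLAN/macvlan style) typically mirrors the lower device's state into
 * its upper device from a netdevice notifier.  example_upper_dev is a
 * placeholder for however the driver looks up its upper device.
 */
static struct net_device *example_upper_dev;

static int example_lower_event(struct notifier_block *nb,
                               unsigned long event, void *ptr)
{
        struct net_device *lower = netdev_notifier_info_to_dev(ptr);

        if (example_upper_dev &&
            (event == NETDEV_UP || event == NETDEV_CHANGE))
                netif_stacked_transfer_operstate(lower, example_upper_dev);

        return NOTIFY_DONE;
}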
9671
9672static int netif_alloc_rx_queues(struct net_device *dev)
9673{
9674        unsigned int i, count = dev->num_rx_queues;
9675        struct netdev_rx_queue *rx;
9676        size_t sz = count * sizeof(*rx);
9677        int err = 0;
9678
9679        BUG_ON(count < 1);
9680
9681        rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9682        if (!rx)
9683                return -ENOMEM;
9684
9685        dev->_rx = rx;
9686
9687        for (i = 0; i < count; i++) {
9688                rx[i].dev = dev;
9689
9690                /* XDP RX-queue setup */
9691                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
9692                if (err < 0)
9693                        goto err_rxq_info;
9694        }
9695        return 0;
9696
9697err_rxq_info:
9698        /* Rollback successful reg's and free other resources */
9699        while (i--)
9700                xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9701        kvfree(dev->_rx);
9702        dev->_rx = NULL;
9703        return err;
9704}
9705
9706static void netif_free_rx_queues(struct net_device *dev)
9707{
9708        unsigned int i, count = dev->num_rx_queues;
9709
9710        /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9711        if (!dev->_rx)
9712                return;
9713
9714        for (i = 0; i < count; i++)
9715                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9716
9717        kvfree(dev->_rx);
9718}
9719
9720static void netdev_init_one_queue(struct net_device *dev,
9721                                  struct netdev_queue *queue, void *_unused)
9722{
9723        /* Initialize queue lock */
9724        spin_lock_init(&queue->_xmit_lock);
9725        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
9726        queue->xmit_lock_owner = -1;
9727        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9728        queue->dev = dev;
9729#ifdef CONFIG_BQL
9730        dql_init(&queue->dql, HZ);
9731#endif
9732}
9733
9734static void netif_free_tx_queues(struct net_device *dev)
9735{
9736        kvfree(dev->_tx);
9737}
9738
9739static int netif_alloc_netdev_queues(struct net_device *dev)
9740{
9741        unsigned int count = dev->num_tx_queues;
9742        struct netdev_queue *tx;
9743        size_t sz = count * sizeof(*tx);
9744
9745        if (count < 1 || count > 0xffff)
9746                return -EINVAL;
9747
9748        tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9749        if (!tx)
9750                return -ENOMEM;
9751
9752        dev->_tx = tx;
9753
9754        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9755        spin_lock_init(&dev->tx_global_lock);
9756
9757        return 0;
9758}
9759
9760void netif_tx_stop_all_queues(struct net_device *dev)
9761{
9762        unsigned int i;
9763
9764        for (i = 0; i < dev->num_tx_queues; i++) {
9765                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9766
9767                netif_tx_stop_queue(txq);
9768        }
9769}
9770EXPORT_SYMBOL(netif_tx_stop_all_queues);
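
/*
 * Example (illustrative sketch, not part of dev.c): drivers commonly
 * stop every TX queue and drop the carrier before resetting their
 * hardware, then bring both back afterwards.  The reset step itself
 * is hypothetical.
 */
static void example_reset(struct net_device *dev)
{
        netif_tx_stop_all_queues(dev);
        netif_carrier_off(dev);

        /* ... reprogram the hardware here ... */

        netif_carrier_on(dev);
        netif_tx_wake_all_queues(dev);
}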
9771
9772/**
9773 *      register_netdevice      - register a network device
9774 *      @dev: device to register
9775 *
9776 *      Take a completed network device structure and add it to the kernel
9777 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9778 *      chain. 0 is returned on success. A negative errno code is returned
9779 *      on a failure to set up the device, or if the name is a duplicate.
9780 *
9781 *      Callers must hold the rtnl semaphore. You may want
9782 *      register_netdev() instead of this.
9783 *
9784 *      BUGS:
9785 *      The locking appears insufficient to guarantee two parallel registers
9786 *      will not get the same name.
9787 */
9788
9789int register_netdevice(struct net_device *dev)
9790{
9791        int ret;
9792        struct net *net = dev_net(dev);
9793
9794        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9795                     NETDEV_FEATURE_COUNT);
9796        BUG_ON(dev_boot_phase);
9797        ASSERT_RTNL();
9798
9799        might_sleep();
9800
9801        /* When net_device's are persistent, this will be fatal. */
9802        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9803        BUG_ON(!net);
9804
9805        ret = ethtool_check_ops(dev->ethtool_ops);
9806        if (ret)
9807                return ret;
9808
9809        spin_lock_init(&dev->addr_list_lock);
9810        netdev_set_addr_lockdep_class(dev);
9811
9812        ret = dev_get_valid_name(net, dev, dev->name);
9813        if (ret < 0)
9814                goto out;
9815
9816        ret = -ENOMEM;
9817        dev->name_node = netdev_name_node_head_alloc(dev);
9818        if (!dev->name_node)
9819                goto out;
9820
9821        /* Init, if this function is available */
9822        if (dev->netdev_ops->ndo_init) {
9823                ret = dev->netdev_ops->ndo_init(dev);
9824                if (ret) {
9825                        if (ret > 0)
9826                                ret = -EIO;
9827                        goto err_free_name;
9828                }
9829        }
9830
9831        if (((dev->hw_features | dev->features) &
9832             NETIF_F_HW_VLAN_CTAG_FILTER) &&
9833            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9834             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9835                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9836                ret = -EINVAL;
9837                goto err_uninit;
9838        }
9839
9840        ret = -EBUSY;
9841        if (!dev->ifindex)
9842                dev->ifindex = dev_new_index(net);
9843        else if (__dev_get_by_index(net, dev->ifindex))
9844                goto err_uninit;
9845
9846        /* Transfer changeable features to wanted_features and enable
9847         * software offloads (GSO and GRO).
9848         */
9849        dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9850        dev->features |= NETIF_F_SOFT_FEATURES;
9851
9852        if (dev->netdev_ops->ndo_udp_tunnel_add) {
9853                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9854                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9855        }
9856
9857        dev->wanted_features = dev->features & dev->hw_features;
9858
9859        if (!(dev->flags & IFF_LOOPBACK))
9860                dev->hw_features |= NETIF_F_NOCACHE_COPY;
9861
9862        /* If IPv4 TCP segmentation offload is supported we should also
9863         * allow the device to enable segmenting the frame with the option
9864         * of ignoring a static IP ID value.  This doesn't enable the
9865         * feature itself but allows the user to enable it later.
9866         */
9867        if (dev->hw_features & NETIF_F_TSO)
9868                dev->hw_features |= NETIF_F_TSO_MANGLEID;
9869        if (dev->vlan_features & NETIF_F_TSO)
9870                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
9871        if (dev->mpls_features & NETIF_F_TSO)
9872                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
9873        if (dev->hw_enc_features & NETIF_F_TSO)
9874                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
9875
9876        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
9877         */
9878        dev->vlan_features |= NETIF_F_HIGHDMA;
9879
9880        /* Make NETIF_F_SG inheritable to tunnel devices.
9881         */
9882        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
9883
9884        /* Make NETIF_F_SG inheritable to MPLS.
9885         */
9886        dev->mpls_features |= NETIF_F_SG;
9887
9888        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
9889        ret = notifier_to_errno(ret);
9890        if (ret)
9891                goto err_uninit;
9892
9893        ret = netdev_register_kobject(dev);
9894        if (ret) {
9895                dev->reg_state = NETREG_UNREGISTERED;
9896                goto err_uninit;
9897        }
9898        dev->reg_state = NETREG_REGISTERED;
9899
9900        __netdev_update_features(dev);
9901
9902        /*
9903         *      Default initial state at register time is that the
9904         *      device is present.
9905         */
9906
9907        set_bit(__LINK_STATE_PRESENT, &dev->state);
9908
9909        linkwatch_init_dev(dev);
9910
9911        dev_init_scheduler(dev);
9912        dev_hold(dev);
9913        list_netdevice(dev);
9914        add_device_randomness(dev->dev_addr, dev->addr_len);
9915
9916        /* If the device has a permanent device address, the driver
9917         * should set dev_addr and leave addr_assign_type at
9918         * NET_ADDR_PERM (the default value).
9919         */
9920        if (dev->addr_assign_type == NET_ADDR_PERM)
9921                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
9922
9923        /* Notify protocols that a new device appeared. */
9924        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
9925        ret = notifier_to_errno(ret);
9926        if (ret) {
9927                rollback_registered(dev);
9928                rcu_barrier();
9929
9930                dev->reg_state = NETREG_UNREGISTERED;
9931                /* We should put the kobject that is still held
9932                 * after netdev_unregister_kobject(); otherwise
9933                 * the net device cannot be freed when the driver
9934                 * calls free_netdev(), because the kobject is
9935                 * still being held.
9936                 */
9937                kobject_put(&dev->dev.kobj);
9938        }
9939        /*
9940         *      Prevent userspace races by waiting until the network
9941         *      device is fully setup before sending notifications.
9942         */
9943        if (!dev->rtnl_link_ops ||
9944            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9945                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9946
9947out:
9948        return ret;
9949
9950err_uninit:
9951        if (dev->netdev_ops->ndo_uninit)
9952                dev->netdev_ops->ndo_uninit(dev);
9953        if (dev->priv_destructor)
9954                dev->priv_destructor(dev);
9955err_free_name:
9956        netdev_name_node_free(dev->name_node);
9957        goto out;
9958}
9959EXPORT_SYMBOL(register_netdevice);
9960
9961/**
9962 *      init_dummy_netdev       - init a dummy network device for NAPI
9963 *      @dev: device to init
9964 *
9965 *      This takes a network device structure and initializes the minimum
9966 *      set of fields so it can be used to schedule NAPI polls without
9967 *      registering a full blown interface. This is to be used by drivers
9968 *      that need to tie several hardware interfaces to a single NAPI
9969 *      poll scheduler due to HW limitations.
9970 */
9971int init_dummy_netdev(struct net_device *dev)
9972{
9973        /* Clear everything. Note we don't initialize spinlocks
9974         * as they aren't supposed to be taken by any of the
9975         * NAPI code and this dummy netdev is supposed to be
9976         * only ever used for NAPI polls
9977         */
9978        memset(dev, 0, sizeof(struct net_device));
9979
9980        /* make sure we BUG if trying to hit standard
9981         * register/unregister code path
9982         */
9983        dev->reg_state = NETREG_DUMMY;
9984
9985        /* NAPI wants this */
9986        INIT_LIST_HEAD(&dev->napi_list);
9987
9988        /* a dummy interface is started by default */
9989        set_bit(__LINK_STATE_PRESENT, &dev->state);
9990        set_bit(__LINK_STATE_START, &dev->state);
9991
9992        /* napi_busy_loop stats accounting wants this */
9993        dev_net_set(dev, &init_net);
9994
9995        /* Note: We don't allocate pcpu_refcnt for dummy devices,
9996         * because users of this 'device' don't need to change
9997         * its refcount.
9998         */
9999
10000        return 0;
10001}
10002EXPORT_SYMBOL_GPL(init_dummy_netdev);
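
/*
 * Example (illustrative sketch, not part of dev.c): a driver with
 * several hardware ports behind one interrupt can host its NAPI
 * context on a dummy netdev.  The structure and function names are
 * hypothetical.
 */
struct example_adapter {
        struct net_device napi_dev;     /* never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        /* ... process up to @budget packets across the ports ... */
        napi_complete_done(napi, 0);
        return 0;
}

static void example_init_napi(struct example_adapter *ad)
{
        init_dummy_netdev(&ad->napi_dev);
        netif_napi_add(&ad->napi_dev, &ad->napi, example_poll,
                       NAPI_POLL_WEIGHT);
        napi_enable(&ad->napi);
}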
10003
10004
10005/**
10006 *      register_netdev - register a network device
10007 *      @dev: device to register
10008 *
10009 *      Take a completed network device structure and add it to the kernel
10010 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10011 *      chain. 0 is returned on success. A negative errno code is returned
10012 *      on a failure to set up the device, or if the name is a duplicate.
10013 *
10014 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
10015 *      and expands the device name if you passed a format string to
10016 *      alloc_netdev.
10017 */
10018int register_netdev(struct net_device *dev)
10019{
10020        int err;
10021
10022        if (rtnl_lock_killable())
10023                return -EINTR;
10024        err = register_netdevice(dev);
10025        rtnl_unlock();
10026        return err;
10027}
10028EXPORT_SYMBOL(register_netdev);
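
/*
 * Example (illustrative sketch, not part of dev.c): the usual probe
 * sequence built on register_netdev(); it takes RTNL itself, so the
 * caller must not hold it.  The ops structure and private struct are
 * hypothetical, and a real driver needs <linux/etherdevice.h>.
 */
struct example_priv {
        int example_field;
};

static const struct net_device_ops example_netdev_ops = {
        /* ndo_open, ndo_start_xmit, ... filled in by a real driver */
};

static int example_probe(struct device *parent)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        SET_NETDEV_DEV(dev, parent);
        dev->netdev_ops = &example_netdev_ops;
        eth_hw_addr_random(dev);

        err = register_netdev(dev);
        if (err) {
                /* free_netdev() copes with both early and late failures */
                free_netdev(dev);
                return err;
        }

        return 0;
}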
10029
10030int netdev_refcnt_read(const struct net_device *dev)
10031{
10032        int i, refcnt = 0;
10033
10034        for_each_possible_cpu(i)
10035                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10036        return refcnt;
10037}
10038EXPORT_SYMBOL(netdev_refcnt_read);
10039
10040/**
10041 * netdev_wait_allrefs - wait until all references are gone.
10042 * @dev: target net_device
10043 *
10044 * This is called when unregistering network devices.
10045 *
10046 * Any protocol or device that holds a reference should register
10047 * for netdevice notification, and cleanup and put back the
10048 * reference if they receive an UNREGISTER event.
10049 * We can get stuck here if buggy protocols don't correctly
10050 * call dev_put.
10051 */
10052static void netdev_wait_allrefs(struct net_device *dev)
10053{
10054        unsigned long rebroadcast_time, warning_time;
10055        int refcnt;
10056
10057        linkwatch_forget_dev(dev);
10058
10059        rebroadcast_time = warning_time = jiffies;
10060        refcnt = netdev_refcnt_read(dev);
10061
10062        while (refcnt != 0) {
10063                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10064                        rtnl_lock();
10065
10066                        /* Rebroadcast unregister notification */
10067                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10068
10069                        __rtnl_unlock();
10070                        rcu_barrier();
10071                        rtnl_lock();
10072
10073                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10074                                     &dev->state)) {
10075                                /* We must not have linkwatch events
10076                                 * pending on unregister. If this
10077                                 * happens, we simply run the queue
10078                                 * unscheduled, resulting in a noop
10079                                 * for this device.
10080                                 */
10081                                linkwatch_run_queue();
10082                        }
10083
10084                        __rtnl_unlock();
10085
10086                        rebroadcast_time = jiffies;
10087                }
10088
10089                msleep(250);
10090
10091                refcnt = netdev_refcnt_read(dev);
10092
10093                if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10094                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10095                                 dev->name, refcnt);
10096                        warning_time = jiffies;
10097                }
10098        }
10099}
10100
10101/* The sequence is:
10102 *
10103 *      rtnl_lock();
10104 *      ...
10105 *      register_netdevice(x1);
10106 *      register_netdevice(x2);
10107 *      ...
10108 *      unregister_netdevice(y1);
10109 *      unregister_netdevice(y2);
10110 *      ...
10111 *      rtnl_unlock();
10112 *      free_netdev(y1);
10113 *      free_netdev(y2);
10114 *
10115 * We are invoked by rtnl_unlock().
10116 * This allows us to deal with problems:
10117 * 1) We can delete sysfs objects which invoke hotplug
10118 *    without deadlocking with linkwatch via keventd.
10119 * 2) Since we run with the RTNL semaphore not held, we can sleep
10120 *    safely in order to wait for the netdev refcnt to drop to zero.
10121 *
10122 * We must not return until all unregister events added during
10123 * the interval the lock was held have been completed.
10124 */
10125void netdev_run_todo(void)
10126{
10127        struct list_head list;
10128#ifdef CONFIG_LOCKDEP
10129        struct list_head unlink_list;
10130
10131        list_replace_init(&net_unlink_list, &unlink_list);
10132
10133        while (!list_empty(&unlink_list)) {
10134                struct net_device *dev = list_first_entry(&unlink_list,
10135                                                          struct net_device,
10136                                                          unlink_list);
10137                list_del(&dev->unlink_list);
10138                dev->nested_level = dev->lower_level - 1;
10139        }
10140#endif
10141
10142        /* Snapshot list, allow later requests */
10143        list_replace_init(&net_todo_list, &list);
10144
10145        __rtnl_unlock();
10146
10147
10148        /* Wait for rcu callbacks to finish before next phase */
10149        if (!list_empty(&list))
10150                rcu_barrier();
10151
10152        while (!list_empty(&list)) {
10153                struct net_device *dev
10154                        = list_first_entry(&list, struct net_device, todo_list);
10155                list_del(&dev->todo_list);
10156
10157                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10158                        pr_err("network todo '%s' but state %d\n",
10159                               dev->name, dev->reg_state);
10160                        dump_stack();
10161                        continue;
10162                }
10163
10164                dev->reg_state = NETREG_UNREGISTERED;
10165
10166                netdev_wait_allrefs(dev);
10167
10168                /* paranoia */
10169                BUG_ON(netdev_refcnt_read(dev));
10170                BUG_ON(!list_empty(&dev->ptype_all));
10171                BUG_ON(!list_empty(&dev->ptype_specific));
10172                WARN_ON(rcu_access_pointer(dev->ip_ptr));
10173                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10174#if IS_ENABLED(CONFIG_DECNET)
10175                WARN_ON(dev->dn_ptr);
10176#endif
10177                if (dev->priv_destructor)
10178                        dev->priv_destructor(dev);
10179                if (dev->needs_free_netdev)
10180                        free_netdev(dev);
10181
10182                /* Report a network device has been unregistered */
10183                rtnl_lock();
10184                dev_net(dev)->dev_unreg_count--;
10185                __rtnl_unlock();
10186                wake_up(&netdev_unregistering_wq);
10187
10188                /* Free network device */
10189                kobject_put(&dev->dev.kobj);
10190        }
10191}
10192
10193/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10194 * all the same fields in the same order as net_device_stats, with only
10195 * the type differing, but rtnl_link_stats64 may have additional fields
10196 * at the end for newer counters.
10197 */
10198void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10199                             const struct net_device_stats *netdev_stats)
10200{
10201#if BITS_PER_LONG == 64
10202        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10203        memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10204        /* zero out counters that only exist in rtnl_link_stats64 */
10205        memset((char *)stats64 + sizeof(*netdev_stats), 0,
10206               sizeof(*stats64) - sizeof(*netdev_stats));
10207#else
10208        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10209        const unsigned long *src = (const unsigned long *)netdev_stats;
10210        u64 *dst = (u64 *)stats64;
10211
10212        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10213        for (i = 0; i < n; i++)
10214                dst[i] = src[i];
10215        /* zero out counters that only exist in rtnl_link_stats64 */
10216        memset((char *)stats64 + n * sizeof(u64), 0,
10217               sizeof(*stats64) - n * sizeof(u64));
10218#endif
10219}
10220EXPORT_SYMBOL(netdev_stats_to_stats64);
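
/*
 * Example (illustrative sketch, not part of dev.c): a driver that only
 * maintains the legacy dev->stats counters can still report 64-bit
 * statistics by converting them, mirroring the fallback used by
 * dev_get_stats() below.  The ndo function name is hypothetical.
 */
static void example_get_stats64(struct net_device *dev,
                                struct rtnl_link_stats64 *storage)
{
        netdev_stats_to_stats64(storage, &dev->stats);
}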
10221
10222/**
10223 *      dev_get_stats   - get network device statistics
10224 *      @dev: device to get statistics from
10225 *      @storage: place to store stats
10226 *
10227 *      Get network statistics from device. Return @storage.
10228 *      The device driver may provide its own method by setting
10229 *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10230 *      otherwise the internal statistics structure is used.
10231 */
10232struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10233                                        struct rtnl_link_stats64 *storage)
10234{
10235        const struct net_device_ops *ops = dev->netdev_ops;
10236
10237        if (ops->ndo_get_stats64) {
10238                memset(storage, 0, sizeof(*storage));
10239                ops->ndo_get_stats64(dev, storage);
10240        } else if (ops->ndo_get_stats) {
10241                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10242        } else {
10243                netdev_stats_to_stats64(storage, &dev->stats);
10244        }
10245        storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10246        storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10247        storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10248        return storage;
10249}
10250EXPORT_SYMBOL(dev_get_stats);
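
/*
 * Example (illustrative sketch, not part of dev.c): reading a device's
 * counters into a caller-provided structure.  dev_get_stats() fills
 * @storage no matter which driver method backs it.
 */
static u64 example_rx_packets(struct net_device *dev)
{
        struct rtnl_link_stats64 stats;

        dev_get_stats(dev, &stats);
        return stats.rx_packets;
}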
10251
10252struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10253{
10254        struct netdev_queue *queue = dev_ingress_queue(dev);
10255
10256#ifdef CONFIG_NET_CLS_ACT
10257        if (queue)
10258                return queue;
10259        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10260        if (!queue)
10261                return NULL;
10262        netdev_init_one_queue(dev, queue, NULL);
10263        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10264        queue->qdisc_sleeping = &noop_qdisc;
10265        rcu_assign_pointer(dev->ingress_queue, queue);
10266#endif
10267        return queue;
10268}
10269
10270static const struct ethtool_ops default_ethtool_ops;
10271
10272void netdev_set_default_ethtool_ops(struct net_device *dev,
10273                                    const struct ethtool_ops *ops)
10274{
10275        if (dev->ethtool_ops == &default_ethtool_ops)
10276                dev->ethtool_ops = ops;
10277}
10278EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10279
10280void netdev_freemem(struct net_device *dev)
10281{
10282        char *addr = (char *)dev - dev->padded;
10283
10284        kvfree(addr);
10285}
10286
10287/**
10288 * alloc_netdev_mqs - allocate network device
10289 * @sizeof_priv: size of private data to allocate space for
10290 * @name: device name format string
10291 * @name_assign_type: origin of device name
10292 * @setup: callback to initialize device
10293 * @txqs: the number of TX subqueues to allocate
10294 * @rxqs: the number of RX subqueues to allocate
10295 *
10296 * Allocates a struct net_device with private data area for driver use
10297 * and performs basic initialization.  Also allocates subqueue structs
10298 * for each queue on the device.
10299 */
10300struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10301                unsigned char name_assign_type,
10302                void (*setup)(struct net_device *),
10303                unsigned int txqs, unsigned int rxqs)
10304{
10305        struct net_device *dev;
10306        unsigned int alloc_size;
10307        struct net_device *p;
10308
10309        BUG_ON(strlen(name) >= sizeof(dev->name));
10310
10311        if (txqs < 1) {
10312                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10313                return NULL;
10314        }
10315
10316        if (rxqs < 1) {
10317                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10318                return NULL;
10319        }
10320
10321        alloc_size = sizeof(struct net_device);
10322        if (sizeof_priv) {
10323                /* ensure 32-byte alignment of private area */
10324                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10325                alloc_size += sizeof_priv;
10326        }
10327        /* ensure 32-byte alignment of whole construct */
10328        alloc_size += NETDEV_ALIGN - 1;
10329
10330        p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10331        if (!p)
10332                return NULL;
10333
10334        dev = PTR_ALIGN(p, NETDEV_ALIGN);
10335        dev->padded = (char *)dev - (char *)p;
10336
10337        dev->pcpu_refcnt = alloc_percpu(int);
10338        if (!dev->pcpu_refcnt)
10339                goto free_dev;
10340
10341        if (dev_addr_init(dev))
10342                goto free_pcpu;
10343
10344        dev_mc_init(dev);
10345        dev_uc_init(dev);
10346
10347        dev_net_set(dev, &init_net);
10348
10349        dev->gso_max_size = GSO_MAX_SIZE;
10350        dev->gso_max_segs = GSO_MAX_SEGS;
10351        dev->upper_level = 1;
10352        dev->lower_level = 1;
10353#ifdef CONFIG_LOCKDEP
10354        dev->nested_level = 0;
10355        INIT_LIST_HEAD(&dev->unlink_list);
10356#endif
10357
10358        INIT_LIST_HEAD(&dev->napi_list);
10359        INIT_LIST_HEAD(&dev->unreg_list);
10360        INIT_LIST_HEAD(&dev->close_list);
10361        INIT_LIST_HEAD(&dev->link_watch_list);
10362        INIT_LIST_HEAD(&dev->adj_list.upper);
10363        INIT_LIST_HEAD(&dev->adj_list.lower);
10364        INIT_LIST_HEAD(&dev->ptype_all);
10365        INIT_LIST_HEAD(&dev->ptype_specific);
10366        INIT_LIST_HEAD(&dev->net_notifier_list);
10367#ifdef CONFIG_NET_SCHED
10368        hash_init(dev->qdisc_hash);
10369#endif
10370        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10371        setup(dev);
10372
10373        if (!dev->tx_queue_len) {
10374                dev->priv_flags |= IFF_NO_QUEUE;
10375                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10376        }
10377
10378        dev->num_tx_queues = txqs;
10379        dev->real_num_tx_queues = txqs;
10380        if (netif_alloc_netdev_queues(dev))
10381                goto free_all;
10382
10383        dev->num_rx_queues = rxqs;
10384        dev->real_num_rx_queues = rxqs;
10385        if (netif_alloc_rx_queues(dev))
10386                goto free_all;
10387
10388        strcpy(dev->name, name);
10389        dev->name_assign_type = name_assign_type;
10390        dev->group = INIT_NETDEV_GROUP;
10391        if (!dev->ethtool_ops)
10392                dev->ethtool_ops = &default_ethtool_ops;
10393
10394        nf_hook_ingress_init(dev);
10395
10396        return dev;
10397
10398free_all:
10399        free_netdev(dev);
10400        return NULL;
10401
10402free_pcpu:
10403        free_percpu(dev->pcpu_refcnt);
10404free_dev:
10405        netdev_freemem(dev);
10406        return NULL;
10407}
10408EXPORT_SYMBOL(alloc_netdev_mqs);
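
/*
 * Example (illustrative sketch, not part of dev.c): allocating a
 * multi-queue Ethernet device.  alloc_etherdev_mqs() (from
 * <linux/etherdevice.h>) is the common wrapper that passes
 * ether_setup() and "eth%d" down to alloc_netdev_mqs().  The queue
 * counts and private struct are illustrative.
 */
struct example_mq_priv {
        int example_field;
};

static struct net_device *example_alloc(void)
{
        /* 8 TX queues and 8 RX queues */
        return alloc_etherdev_mqs(sizeof(struct example_mq_priv), 8, 8);
}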
10409
10410/**
10411 * free_netdev - free network device
10412 * @dev: device
10413 *
10414 * This function does the last stage of destroying an allocated device
10415 * interface. The reference to the device object is released. If this
10416 * is the last reference then it will be freed. Must be called in process
10417 * context.
10418 */
10419void free_netdev(struct net_device *dev)
10420{
10421        struct napi_struct *p, *n;
10422
10423        might_sleep();
10424        netif_free_tx_queues(dev);
10425        netif_free_rx_queues(dev);
10426
10427        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10428
10429        /* Flush device addresses */
10430        dev_addr_flush(dev);
10431
10432        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10433                netif_napi_del(p);
10434
10435        free_percpu(dev->pcpu_refcnt);
10436        dev->pcpu_refcnt = NULL;
10437        free_percpu(dev->xdp_bulkq);
10438        dev->xdp_bulkq = NULL;
10439
10440        /*  Compatibility with error handling in drivers */
10441        if (dev->reg_state == NETREG_UNINITIALIZED) {
10442                netdev_freemem(dev);
10443                return;
10444        }
10445
10446        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10447        dev->reg_state = NETREG_RELEASED;
10448
10449        /* will free via device release */
10450        put_device(&dev->dev);
10451}
10452EXPORT_SYMBOL(free_netdev);
10453
10454/**
10455 *      synchronize_net -  Synchronize with packet receive processing
10456 *
10457 *      Wait for packets currently being received to be done.
10458 *      Does not block later packets from starting.
10459 */
10460void synchronize_net(void)
10461{
10462        might_sleep();
10463        if (rtnl_is_locked())
10464                synchronize_rcu_expedited();
10465        else
10466                synchronize_rcu();
10467}
10468EXPORT_SYMBOL(synchronize_net);
10469
10470/**
10471 *      unregister_netdevice_queue - remove device from the kernel
10472 *      @dev: device
10473 *      @head: list
10474 *
10475 *      This function shuts down a device interface and removes it
10476 *      from the kernel tables.
10477 *      If head not NULL, device is queued to be unregistered later.
10478 *
10479 *      Callers must hold the rtnl semaphore.  You may want
10480 *      unregister_netdev() instead of this.
10481 */
10482
10483void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10484{
10485        ASSERT_RTNL();
10486
10487        if (head) {
10488                list_move_tail(&dev->unreg_list, head);
10489        } else {
10490                rollback_registered(dev);
10491                /* Finish processing unregister after unlock */
10492                net_set_todo(dev);
10493        }
10494}
10495EXPORT_SYMBOL(unregister_netdevice_queue);
10496
10497/**
10498 *      unregister_netdevice_many - unregister many devices
10499 *      @head: list of devices
10500 *
10501 *  Note: As most callers use a stack-allocated list_head,
10502 *  we force a list_del() to make sure the stack won't be corrupted later.
10503 */
10504void unregister_netdevice_many(struct list_head *head)
10505{
10506        struct net_device *dev;
10507
10508        if (!list_empty(head)) {
10509                rollback_registered_many(head);
10510                list_for_each_entry(dev, head, unreg_list)
10511                        net_set_todo(dev);
10512                list_del(head);
10513        }
10514}
10515EXPORT_SYMBOL(unregister_netdevice_many);
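
/*
 * Example (illustrative sketch, not part of dev.c): tearing down a
 * group of devices under a single RTNL hold.  Queueing each device
 * with unregister_netdevice_queue() and flushing the list once is
 * cheaper than unregistering them one at a time.  Matching on a
 * hypothetical ops pointer stands in for however a driver finds its
 * own devices.
 */
static void example_destroy_all(struct net *net,
                                const struct net_device_ops *example_ops)
{
        struct net_device *dev, *tmp;
        LIST_HEAD(kill_list);

        rtnl_lock();
        for_each_netdev_safe(net, dev, tmp)
                if (dev->netdev_ops == example_ops)
                        unregister_netdevice_queue(dev, &kill_list);
        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}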
10516
10517/**
10518 *      unregister_netdev - remove device from the kernel
10519 *      @dev: device
10520 *
10521 *      This function shuts down a device interface and removes it
10522 *      from the kernel tables.
10523 *
10524 *      This is just a wrapper for unregister_netdevice that takes
10525 *      the rtnl semaphore.  In general you want to use this and not
10526 *      unregister_netdevice.
10527 */
10528void unregister_netdev(struct net_device *dev)
10529{
10530        rtnl_lock();
10531        unregister_netdevice(dev);
10532        rtnl_unlock();
10533}
10534EXPORT_SYMBOL(unregister_netdev);
10535
10536/**
10537 *      dev_change_net_namespace - move device to a different network namespace
10538 *      @dev: device
10539 *      @net: network namespace
10540 *      @pat: If not NULL name pattern to try if the current device name
10541 *            is already taken in the destination network namespace.
10542 *
10543 *      This function shuts down a device interface and moves it
10544 *      to a new network namespace. On success 0 is returned, on
10545 *      a failure a negative errno code is returned.
10546 *
10547 *      Callers must hold the rtnl semaphore.
10548 */
10549
10550int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10551{
10552        struct net *net_old = dev_net(dev);
10553        int err, new_nsid, new_ifindex;
10554
10555        ASSERT_RTNL();
10556
10557        /* Don't allow namespace local devices to be moved. */
10558        err = -EINVAL;
10559        if (dev->features & NETIF_F_NETNS_LOCAL)
10560                goto out;
10561
10562        /* Ensure the device has been registered */
10563        if (dev->reg_state != NETREG_REGISTERED)
10564                goto out;
10565
10566        /* Get out if there is nothing to do */
10567        err = 0;
10568        if (net_eq(net_old, net))
10569                goto out;
10570
10571        /* Pick the destination device name, and ensure
10572         * we can use it in the destination network namespace.
10573         */
10574        err = -EEXIST;
10575        if (__dev_get_by_name(net, dev->name)) {
10576                /* We get here if we can't use the current device name */
10577                if (!pat)
10578                        goto out;
10579                err = dev_get_valid_name(net, dev, pat);
10580                if (err < 0)
10581                        goto out;
10582        }
10583
10584        /*
10585         * And now a mini version of register_netdevice() and unregister_netdevice().
10586         */
10587
10588        /* If device is running, close it first. */
10589        dev_close(dev);
10590
10591        /* And unlink it from device chain */
10592        unlist_netdevice(dev);
10593
10594        synchronize_net();
10595
10596        /* Shutdown queueing discipline. */
10597        dev_shutdown(dev);
10598
10599        /* Notify protocols that we are about to destroy
10600         * this device. They should clean up all their state.
10601         *
10602         * Note that dev->reg_state stays at NETREG_REGISTERED.
10603         * This is wanted because this way 8021q and macvlan know
10604         * the device is just moving and can keep their slaves up.
10605         */
10606        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10607        rcu_barrier();
10608
10609        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10610        /* If there is an ifindex conflict assign a new one */
10611        if (__dev_get_by_index(net, dev->ifindex))
10612                new_ifindex = dev_new_index(net);
10613        else
10614                new_ifindex = dev->ifindex;
10615
10616        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10617                            new_ifindex);
10618
10619        /*
10620         *      Flush the unicast and multicast chains
10621         */
10622        dev_uc_flush(dev);
10623        dev_mc_flush(dev);
10624
10625        /* Send a netdev-removed uevent to the old namespace */
10626        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10627        netdev_adjacent_del_links(dev);
10628
10629        /* Move per-net netdevice notifiers that are following the netdevice */
10630        move_netdevice_notifiers_dev_net(dev, net);
10631
10632        /* Actually switch the network namespace */
10633        dev_net_set(dev, net);
10634        dev->ifindex = new_ifindex;
10635
10636        /* Send a netdev-add uevent to the new namespace */
10637        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10638        netdev_adjacent_add_links(dev);
10639
10640        /* Fixup kobjects */
10641        err = device_rename(&dev->dev, dev->name);
10642        WARN_ON(err);
10643
10644        /* Adapt owner in case owning user namespace of target network
10645         * namespace is different from the original one.
10646         */
10647        err = netdev_change_owner(dev, net_old, net);
10648        WARN_ON(err);
10649
10650        /* Add the device back in the hashes */
10651        list_netdevice(dev);
10652
10653        /* Notify protocols that a new device appeared. */
10654        call_netdevice_notifiers(NETDEV_REGISTER, dev);
10655
10656        /*
10657         *      Prevent userspace races by waiting until the network
10658         *      device is fully setup before sending notifications.
10659         */
10660        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10661
10662        synchronize_net();
10663        err = 0;
10664out:
10665        return err;
10666}
10667EXPORT_SYMBOL_GPL(dev_change_net_namespace);
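
/*
 * Example (illustrative sketch, not part of dev.c): moving a device
 * into the network namespace behind an open netns fd, roughly what the
 * RTM_SETLINK/IFLA_NET_NS_FD path does.  The fallback name pattern is
 * illustrative.
 */
static int example_move_to_netns(struct net_device *dev, int netns_fd)
{
        struct net *net;
        int err;

        net = get_net_ns_by_fd(netns_fd);
        if (IS_ERR(net))
                return PTR_ERR(net);

        rtnl_lock();
        /* fall back to "eth%d" if the current name is taken over there */
        err = dev_change_net_namespace(dev, net, "eth%d");
        rtnl_unlock();

        put_net(net);
        return err;
}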
10668
10669static int dev_cpu_dead(unsigned int oldcpu)
10670{
10671        struct sk_buff **list_skb;
10672        struct sk_buff *skb;
10673        unsigned int cpu;
10674        struct softnet_data *sd, *oldsd, *remsd = NULL;
10675
10676        local_irq_disable();
10677        cpu = smp_processor_id();
10678        sd = &per_cpu(softnet_data, cpu);
10679        oldsd = &per_cpu(softnet_data, oldcpu);
10680
10681        /* Find end of our completion_queue. */
10682        list_skb = &sd->completion_queue;
10683        while (*list_skb)
10684                list_skb = &(*list_skb)->next;
10685        /* Append completion queue from offline CPU. */
10686        *list_skb = oldsd->completion_queue;
10687        oldsd->completion_queue = NULL;
10688
10689        /* Append output queue from offline CPU. */
10690        if (oldsd->output_queue) {
10691                *sd->output_queue_tailp = oldsd->output_queue;
10692                sd->output_queue_tailp = oldsd->output_queue_tailp;
10693                oldsd->output_queue = NULL;
10694                oldsd->output_queue_tailp = &oldsd->output_queue;
10695        }
10696        /* Append NAPI poll list from offline CPU, with one exception:
10697         * process_backlog() must be called by the CPU owning the percpu backlog.
10698         * We properly handle process_queue & input_pkt_queue later.
10699         */
10700        while (!list_empty(&oldsd->poll_list)) {
10701                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10702                                                            struct napi_struct,
10703                                                            poll_list);
10704
10705                list_del_init(&napi->poll_list);
10706                if (napi->poll == process_backlog)
10707                        napi->state = 0;
10708                else
10709                        ____napi_schedule(sd, napi);
10710        }
10711
10712        raise_softirq_irqoff(NET_TX_SOFTIRQ);
10713        local_irq_enable();
10714
10715#ifdef CONFIG_RPS
10716        remsd = oldsd->rps_ipi_list;
10717        oldsd->rps_ipi_list = NULL;
10718#endif
10719        /* send out pending IPI's on offline CPU */
10720        net_rps_send_ipi(remsd);
10721
10722        /* Process offline CPU's input_pkt_queue */
10723        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10724                netif_rx_ni(skb);
10725                input_queue_head_incr(oldsd);
10726        }
10727        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10728                netif_rx_ni(skb);
10729                input_queue_head_incr(oldsd);
10730        }
10731
10732        return 0;
10733}
10734
10735/**
10736 *      netdev_increment_features - increment feature set by one
10737 *      @all: current feature set
10738 *      @one: new feature set
10739 *      @mask: mask feature set
10740 *
10741 *      Computes a new feature set after adding a device with feature set
10742 *      @one to the master device with current feature set @all.  Will not
10743 *      enable anything that is off in @mask. Returns the new feature set.
10744 */
10745netdev_features_t netdev_increment_features(netdev_features_t all,
10746        netdev_features_t one, netdev_features_t mask)
10747{
10748        if (mask & NETIF_F_HW_CSUM)
10749                mask |= NETIF_F_CSUM_MASK;
10750        mask |= NETIF_F_VLAN_CHALLENGED;
10751
10752        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10753        all &= one | ~NETIF_F_ALL_FOR_ALL;
10754
10755        /* If one device supports hw checksumming, set for all. */
10756        if (all & NETIF_F_HW_CSUM)
10757                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10758
10759        return all;
10760}
10761EXPORT_SYMBOL(netdev_increment_features);
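
/*
 * Example (illustrative sketch, not part of dev.c): a bridge/bonding
 * style master recomputing its offload set from all lower devices,
 * roughly the pattern the bridge uses.  The caller must hold RTNL for
 * netdev_for_each_lower_dev().
 */
static netdev_features_t example_master_features(struct net_device *master)
{
        netdev_features_t mask = master->features;
        netdev_features_t features = mask & ~NETIF_F_ONE_FOR_ALL;
        struct net_device *lower;
        struct list_head *iter;

        netdev_for_each_lower_dev(master, lower, iter)
                features = netdev_increment_features(features,
                                                     lower->features, mask);
        return features;
}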
10762
10763static struct hlist_head * __net_init netdev_create_hash(void)
10764{
10765        int i;
10766        struct hlist_head *hash;
10767
10768        hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10769        if (hash != NULL)
10770                for (i = 0; i < NETDEV_HASHENTRIES; i++)
10771                        INIT_HLIST_HEAD(&hash[i]);
10772
10773        return hash;
10774}
10775
10776/* Initialize per network namespace state */
10777static int __net_init netdev_init(struct net *net)
10778{
10779        BUILD_BUG_ON(GRO_HASH_BUCKETS >
10780                     8 * sizeof_field(struct napi_struct, gro_bitmask));
10781
10782        if (net != &init_net)
10783                INIT_LIST_HEAD(&net->dev_base_head);
10784
10785        net->dev_name_head = netdev_create_hash();
10786        if (net->dev_name_head == NULL)
10787                goto err_name;
10788
10789        net->dev_index_head = netdev_create_hash();
10790        if (net->dev_index_head == NULL)
10791                goto err_idx;
10792
10793        RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10794
10795        return 0;
10796
10797err_idx:
10798        kfree(net->dev_name_head);
10799err_name:
10800        return -ENOMEM;
10801}
10802
10803/**
10804 *      netdev_drivername - network driver for the device
10805 *      @dev: network device
10806 *
10807 *      Determine network driver for device.
10808 */
10809const char *netdev_drivername(const struct net_device *dev)
10810{
10811        const struct device_driver *driver;
10812        const struct device *parent;
10813        const char *empty = "";
10814
10815        parent = dev->dev.parent;
10816        if (!parent)
10817                return empty;
10818
10819        driver = parent->driver;
10820        if (driver && driver->name)
10821                return driver->name;
10822        return empty;
10823}
10824
10825static void __netdev_printk(const char *level, const struct net_device *dev,
10826                            struct va_format *vaf)
10827{
10828        if (dev && dev->dev.parent) {
10829                dev_printk_emit(level[1] - '0',
10830                                dev->dev.parent,
10831                                "%s %s %s%s: %pV",
10832                                dev_driver_string(dev->dev.parent),
10833                                dev_name(dev->dev.parent),
10834                                netdev_name(dev), netdev_reg_state(dev),
10835                                vaf);
10836        } else if (dev) {
10837                printk("%s%s%s: %pV",
10838                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10839        } else {
10840                printk("%s(NULL net_device): %pV", level, vaf);
10841        }
10842}
10843
10844void netdev_printk(const char *level, const struct net_device *dev,
10845                   const char *format, ...)
10846{
10847        struct va_format vaf;
10848        va_list args;
10849
10850        va_start(args, format);
10851
10852        vaf.fmt = format;
10853        vaf.va = &args;
10854
10855        __netdev_printk(level, dev, &vaf);
10856
10857        va_end(args);
10858}
10859EXPORT_SYMBOL(netdev_printk);
10860
10861#define define_netdev_printk_level(func, level)                 \
10862void func(const struct net_device *dev, const char *fmt, ...)   \
10863{                                                               \
10864        struct va_format vaf;                                   \
10865        va_list args;                                           \
10866                                                                \
10867        va_start(args, fmt);                                    \
10868                                                                \
10869        vaf.fmt = fmt;                                          \
10870        vaf.va = &args;                                         \
10871                                                                \
10872        __netdev_printk(level, dev, &vaf);                      \
10873                                                                \
10874        va_end(args);                                           \
10875}                                                               \
10876EXPORT_SYMBOL(func);
10877
10878define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10879define_netdev_printk_level(netdev_alert, KERN_ALERT);
10880define_netdev_printk_level(netdev_crit, KERN_CRIT);
10881define_netdev_printk_level(netdev_err, KERN_ERR);
10882define_netdev_printk_level(netdev_warn, KERN_WARNING);
10883define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10884define_netdev_printk_level(netdev_info, KERN_INFO);
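/*
 * Each expansion above generates a helper with the signature
 *
 *        void netdev_<level>(const struct net_device *dev, const char *fmt, ...);
 *
 * so a driver can write, for instance (illustrative only):
 *
 *        netdev_warn(dev, "unsupported link speed %u\n", speed);
 *
 * The message is routed through __netdev_printk() and, when a parent device
 * is present, prefixed with the driver, bus device and interface names.
 */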
10885
10886static void __net_exit netdev_exit(struct net *net)
10887{
10888        kfree(net->dev_name_head);
10889        kfree(net->dev_index_head);
10890        if (net != &init_net)
10891                WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10892}
10893
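/* Registered via register_pernet_subsys() in net_dev_init() below, so
 * netdev_init()/netdev_exit() run for every network namespace as it is
 * created and torn down.
 */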
10894static struct pernet_operations __net_initdata netdev_net_ops = {
10895        .init = netdev_init,
10896        .exit = netdev_exit,
10897};
10898
10899static void __net_exit default_device_exit(struct net *net)
10900{
10901        struct net_device *dev, *aux;
10902        /*
10903         * Push all migratable network devices back to the
10904         * initial network namespace
10905         */
10906        rtnl_lock();
10907        for_each_netdev_safe(net, dev, aux) {
10908                int err;
10909                char fb_name[IFNAMSIZ];
10910
10911                /* Ignore unmovable devices (e.g. loopback) */
10912                if (dev->features & NETIF_F_NETNS_LOCAL)
10913                        continue;
10914
10915                /* Leave virtual devices for the generic cleanup */
10916                if (dev->rtnl_link_ops)
10917                        continue;
10918
10919                /* Push remaining network devices to init_net */
10920                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
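                /* If "dev<ifindex>" is already taken in init_net, fall back
                 * to the "dev%d" template so a free unit number is chosen
                 * when the device is renamed during the move below.
                 */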
10921                if (__dev_get_by_name(&init_net, fb_name))
10922                        snprintf(fb_name, IFNAMSIZ, "dev%%d");
10923                err = dev_change_net_namespace(dev, &init_net, fb_name);
10924                if (err) {
10925                        pr_emerg("%s: failed to move %s to init_net: %d\n",
10926                                 __func__, dev->name, err);
10927                        BUG();
10928                }
10929        }
10930        rtnl_unlock();
10931}
10932
10933static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10934{
10935        /* Return with the rtnl_lock held when there are no network
10936         * devices unregistering in any network namespace in net_list.
10937         */
10938        struct net *net;
10939        bool unregistering;
10940        DEFINE_WAIT_FUNC(wait, woken_wake_function);
10941
10942        add_wait_queue(&netdev_unregistering_wq, &wait);
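        /* Classic wait_woken() loop: re-check the condition under the rtnl
         * lock each time round; if any namespace in the list still has
         * devices unregistering, drop the lock and sleep until the
         * unregistration path wakes netdev_unregistering_wq.
         */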
10943        for (;;) {
10944                unregistering = false;
10945                rtnl_lock();
10946                list_for_each_entry(net, net_list, exit_list) {
10947                        if (net->dev_unreg_count > 0) {
10948                                unregistering = true;
10949                                break;
10950                        }
10951                }
10952                if (!unregistering)
10953                        break;
10954                __rtnl_unlock();
10955
10956                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10957        }
10958        remove_wait_queue(&netdev_unregistering_wq, &wait);
10959}
10960
10961static void __net_exit default_device_exit_batch(struct list_head *net_list)
10962{
10963        /* At exit all network devices must be removed from a network
10964         * namespace.  Do this in the reverse order of registration.
10965         * Do this across as many network namespaces as possible to
10966         * improve batching efficiency.
10967         */
10968        struct net_device *dev;
10969        struct net *net;
10970        LIST_HEAD(dev_kill_list);
10971
10972        /* To prevent network device cleanup code from dereferencing
10973         * loopback devices or network devices that have been freed
10974         * wait here for all pending unregistrations to complete,
10975         * before unregistering the loopback device and allowing the
10976         * network namespace to be freed.
10977         *
10978         * The netdev todo list containing all network devices
10979         * unregistrations that happen in default_device_exit_batch
10980         * will run in the rtnl_unlock() at the end of
10981         * default_device_exit_batch.
10982         */
10983        rtnl_lock_unregistering(net_list);
10984        list_for_each_entry(net, net_list, exit_list) {
10985                for_each_netdev_reverse(net, dev) {
10986                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10987                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10988                        else
10989                                unregister_netdevice_queue(dev, &dev_kill_list);
10990                }
10991        }
10992        unregister_netdevice_many(&dev_kill_list);
10993        rtnl_unlock();
10994}
10995
10996static struct pernet_operations __net_initdata default_device_ops = {
10997        .exit = default_device_exit,
10998        .exit_batch = default_device_exit_batch,
10999};
11000
11001/*
11002 *      Initialize the DEV module. At boot time this walks the device list and
11003 *      unhooks any devices that fail to initialise (normally hardware not
11004 *      present) and leaves us with a valid list of present and active devices.
11005 *
11006 */
11007
11008/*
11009 *       This is called single-threaded during boot, so no need
11010 *       to take the rtnl semaphore.
11011 */
11012static int __init net_dev_init(void)
11013{
11014        int i, rc = -ENOMEM;
11015
11016        BUG_ON(!dev_boot_phase);
11017
11018        if (dev_proc_init())
11019                goto out;
11020
11021        if (netdev_kobject_init())
11022                goto out;
11023
11024        INIT_LIST_HEAD(&ptype_all);
11025        for (i = 0; i < PTYPE_HASH_SIZE; i++)
11026                INIT_LIST_HEAD(&ptype_base[i]);
11027
11028        INIT_LIST_HEAD(&offload_base);
11029
11030        if (register_pernet_subsys(&netdev_net_ops))
11031                goto out;
11032
11033        /*
11034         *      Initialise the packet receive queues.
11035         */
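        /* Each CPU's backlog NAPI instance below uses process_backlog() as
         * its poll routine, so packets that netif_rx() places on
         * input_pkt_queue are drained later from NET_RX softirq context.
         */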
11036
11037        for_each_possible_cpu(i) {
11038                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11039                struct softnet_data *sd = &per_cpu(softnet_data, i);
11040
11041                INIT_WORK(flush, flush_backlog);
11042
11043                skb_queue_head_init(&sd->input_pkt_queue);
11044                skb_queue_head_init(&sd->process_queue);
11045#ifdef CONFIG_XFRM_OFFLOAD
11046                skb_queue_head_init(&sd->xfrm_backlog);
11047#endif
11048                INIT_LIST_HEAD(&sd->poll_list);
11049                sd->output_queue_tailp = &sd->output_queue;
11050#ifdef CONFIG_RPS
11051                sd->csd.func = rps_trigger_softirq;
11052                sd->csd.info = sd;
11053                sd->cpu = i;
11054#endif
11055
11056                init_gro_hash(&sd->backlog);
11057                sd->backlog.poll = process_backlog;
11058                sd->backlog.weight = weight_p;
11059        }
11060
11061        dev_boot_phase = 0;
11062
11063        /* The loopback device is special: if any other network device
11064         * is present in a network namespace, the loopback device must
11065         * be present too. Since we now dynamically allocate and free
11066         * the loopback device, ensure this invariant is maintained by
11067         * keeping the loopback device as the first device on the
11068         * list of network devices, so that it is the first device
11069         * that appears and the last network device that
11070         * disappears.
11071         */
11072        if (register_pernet_device(&loopback_net_ops))
11073                goto out;
11074
11075        if (register_pernet_device(&default_device_ops))
11076                goto out;
11077
11078        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11079        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11080
11081        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11082                                       NULL, dev_cpu_dead);
11083        WARN_ON(rc < 0);
11084        rc = 0;
11085out:
11086        return rc;
11087}
11088
11089subsys_initcall(net_dev_init);
11090