net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 * ptype_lock is held around updates of the packet-type lists (see
 * dev_add_pack() and __dev_remove_pack()); offload_lock is held around
 * updates of offload_base (see dev_add_offload() and
 * __dev_remove_offload()).
 */
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
/* Global handlers without a device, bucketed by protocol type (see ptype_head()). */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;       /* Taps */
/* List of registered struct packet_offload entries. */
static struct list_head offload_base __read_mostly;

/* Forward declarations of static helpers used before their definition. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info);
 157
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages, see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 179
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * Sequence counter sampled by netdev_get_name() so that it can retry
 * a name copy that raced with an update of this counter.
 */
static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
/*
 *      Our notifier list
 *
 *      NOTE(review): declared as a raw notifier head, so serialisation is
 *      presumably left to the callers -- confirm against the users.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One struct softnet_data instance exists per CPU.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
/*
 * One lockdep class key per netdev_lock_type[] entry, kept separately
 * for the xmit locks and for the address list locks.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
/*
 * Without CONFIG_LOCKDEP the lock class tables above are not built, so
 * both helpers are empty stubs.
 */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as unused by
 * netdev_boot_setup_add() and netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      dev_get_iflink  - get 'iflink' value of a interface
 664  *      @dev: targeted interface
 665  *
 666  *      Indicates the ifindex the interface is linked to.
 667  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 668  */
 669
 670 int dev_get_iflink(const struct net_device *dev)
 671 {
 672         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 673                 return dev->netdev_ops->ndo_get_iflink(dev);
 674
 675         return dev->ifindex;
 676 }
 677 EXPORT_SYMBOL(dev_get_iflink);
 678
 679 /**
 680  *      __dev_get_by_name       - find a device by its name
 681  *      @net: the applicable net namespace
 682  *      @name: name to find
 683  *
 684  *      Find an interface by name. Must be called under RTNL semaphore
 685  *      or @dev_base_lock. If the name is found a pointer to the device
 686  *      is returned. If the name is not found then %NULL is returned. The
 687  *      reference counters are not incremented so the caller must be
 688  *      careful with locks.
 689  */
 690
 691 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 692 {
 693         struct net_device *dev;
 694         struct hlist_head *head = dev_name_hash(net, name);
 695
 696         hlist_for_each_entry(dev, head, name_hlist)
 697                 if (!strncmp(dev->name, name, IFNAMSIZ))
 698                         return dev;
 699
 700         return NULL;
 701 }
 702 EXPORT_SYMBOL(__dev_get_by_name);
 703
 704 /**
 705  *      dev_get_by_name_rcu     - find a device by its name
 706  *      @net: the applicable net namespace
 707  *      @name: name to find
 708  *
 709  *      Find an interface by name.
 710  *      If the name is found a pointer to the device is returned.
 711  *      If the name is not found then %NULL is returned.
 712  *      The reference counters are not incremented so the caller must be
 713  *      careful with locks. The caller must hold RCU lock.
 714  */
 715
 716 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 717 {
 718         struct net_device *dev;
 719         struct hlist_head *head = dev_name_hash(net, name);
 720
 721         hlist_for_each_entry_rcu(dev, head, name_hlist)
 722                 if (!strncmp(dev->name, name, IFNAMSIZ))
 723                         return dev;
 724
 725         return NULL;
 726 }
 727 EXPORT_SYMBOL(dev_get_by_name_rcu);
 728
 729 /**
 730  *      dev_get_by_name         - find a device by its name
 731  *      @net: the applicable net namespace
 732  *      @name: name to find
 733  *
 734  *      Find an interface by name. This can be called from any
 735  *      context and does its own locking. The returned handle has
 736  *      the usage count incremented and the caller must use dev_put() to
 737  *      release it when it is no longer needed. %NULL is returned if no
 738  *      matching device is found.
 739  */
 740
 741 struct net_device *dev_get_by_name(struct net *net, const char *name)
 742 {
 743         struct net_device *dev;
 744
 745         rcu_read_lock();
 746         dev = dev_get_by_name_rcu(net, name);
 747         if (dev)
 748                 dev_hold(dev);
 749         rcu_read_unlock();
 750         return dev;
 751 }
 752 EXPORT_SYMBOL(dev_get_by_name);
 753
 754 /**
 755  *      __dev_get_by_index - find a device by its ifindex
 756  *      @net: the applicable net namespace
 757  *      @ifindex: index of device
 758  *
 759  *      Search for an interface by index. Returns %NULL if the device
 760  *      is not found or a pointer to the device. The device has not
 761  *      had its reference counter increased so the caller must be careful
 762  *      about locking. The caller must hold either the RTNL semaphore
 763  *      or @dev_base_lock.
 764  */
 765
 766 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 767 {
 768         struct net_device *dev;
 769         struct hlist_head *head = dev_index_hash(net, ifindex);
 770
 771         hlist_for_each_entry(dev, head, index_hlist)
 772                 if (dev->ifindex == ifindex)
 773                         return dev;
 774
 775         return NULL;
 776 }
 777 EXPORT_SYMBOL(__dev_get_by_index);
 778
 779 /**
 780  *      dev_get_by_index_rcu - find a device by its ifindex
 781  *      @net: the applicable net namespace
 782  *      @ifindex: index of device
 783  *
 784  *      Search for an interface by index. Returns %NULL if the device
 785  *      is not found or a pointer to the device. The device has not
 786  *      had its reference counter increased so the caller must be careful
 787  *      about locking. The caller must hold RCU lock.
 788  */
 789
 790 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 791 {
 792         struct net_device *dev;
 793         struct hlist_head *head = dev_index_hash(net, ifindex);
 794
 795         hlist_for_each_entry_rcu(dev, head, index_hlist)
 796                 if (dev->ifindex == ifindex)
 797                         return dev;
 798
 799         return NULL;
 800 }
 801 EXPORT_SYMBOL(dev_get_by_index_rcu);
 802
 803
 804 /**
 805  *      dev_get_by_index - find a device by its ifindex
 806  *      @net: the applicable net namespace
 807  *      @ifindex: index of device
 808  *
 809  *      Search for an interface by index. Returns NULL if the device
 810  *      is not found or a pointer to the device. The device returned has
 811  *      had a reference added and the pointer is safe until the user calls
 812  *      dev_put to indicate they have finished with it.
 813  */
 814
 815 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 816 {
 817         struct net_device *dev;
 818
 819         rcu_read_lock();
 820         dev = dev_get_by_index_rcu(net, ifindex);
 821         if (dev)
 822                 dev_hold(dev);
 823         rcu_read_unlock();
 824         return dev;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_index);
 827
 828 /**
 829  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 830  *      @net: network namespace
 831  *      @name: a pointer to the buffer where the name will be stored.
 832  *      @ifindex: the ifindex of the interface to get the name from.
 833  *
 834  *      The use of raw_seqcount_begin() and cond_resched() before
 835  *      retrying is required as we want to give the writers a chance
 836  *      to complete when CONFIG_PREEMPT is not set.
 837  */
 838 int netdev_get_name(struct net *net, char *name, int ifindex)
 839 {
 840         struct net_device *dev;
 841         unsigned int seq;
 842
 843 retry:
 844         seq = raw_seqcount_begin(&devnet_rename_seq);
 845         rcu_read_lock();
 846         dev = dev_get_by_index_rcu(net, ifindex);
 847         if (!dev) {
 848                 rcu_read_unlock();
 849                 return -ENODEV;
 850         }
 851
 852         strcpy(name, dev->name);
 853         rcu_read_unlock();
 854         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 855                 cond_resched();
 856                 goto retry;
 857         }
 858
 859         return 0;
 860 }
 861
 862 /**
 863  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 864  *      @net: the applicable net namespace
 865  *      @type: media type of device
 866  *      @ha: hardware address
 867  *
 868  *      Search for an interface by MAC address. Returns NULL if the device
 869  *      is not found or a pointer to the device.
 870  *      The caller must hold RCU or RTNL.
 871  *      The returned device has not had its ref count increased
 872  *      and the caller must therefore be careful about locking
 873  *
 874  */
 875
 876 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 877                                        const char *ha)
 878 {
 879         struct net_device *dev;
 880
 881         for_each_netdev_rcu(net, dev)
 882                 if (dev->type == type &&
 883                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 884                         return dev;
 885
 886         return NULL;
 887 }
 888 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 889
 890 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 891 {
 892         struct net_device *dev;
 893
 894         ASSERT_RTNL();
 895         for_each_netdev(net, dev)
 896                 if (dev->type == type)
 897                         return dev;
 898
 899         return NULL;
 900 }
 901 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 902
 903 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 904 {
 905         struct net_device *dev, *ret = NULL;
 906
 907         rcu_read_lock();
 908         for_each_netdev_rcu(net, dev)
 909                 if (dev->type == type) {
 910                         dev_hold(dev);
 911                         ret = dev;
 912                         break;
 913                 }
 914         rcu_read_unlock();
 915         return ret;
 916 }
 917 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 918
 919 /**
 920  *      __dev_get_by_flags - find any device with given flags
 921  *      @net: the applicable net namespace
 922  *      @if_flags: IFF_* values
 923  *      @mask: bitmask of bits in if_flags to check
 924  *
 925  *      Search for any interface with the given flags. Returns NULL if a device
 926  *      is not found or a pointer to the device. Must be called inside
 927  *      rtnl_lock(), and result refcount is unchanged.
 928  */
 929
 930 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 931                                       unsigned short mask)
 932 {
 933         struct net_device *dev, *ret;
 934
 935         ASSERT_RTNL();
 936
 937         ret = NULL;
 938         for_each_netdev(net, dev) {
 939                 if (((dev->flags ^ if_flags) & mask) == 0) {
 940                         ret = dev;
 941                         break;
 942                 }
 943         }
 944         return ret;
 945 }
 946 EXPORT_SYMBOL(__dev_get_by_flags);
 947
 948 /**
 949  *      dev_valid_name - check if name is okay for network device
 950  *      @name: name string
 951  *
 *      Network device names need to be valid file names
 *      to allow sysfs to work.  We also disallow any kind of
 954  *      whitespace.
 955  */
 956 bool dev_valid_name(const char *name)
 957 {
 958         if (*name == '\0')
 959                 return false;
 960         if (strlen(name) >= IFNAMSIZ)
 961                 return false;
 962         if (!strcmp(name, ".") || !strcmp(name, ".."))
 963                 return false;
 964
 965         while (*name) {
 966                 if (*name == '/' || *name == ':' || isspace(*name))
 967                         return false;
 968                 name++;
 969         }
 970         return true;
 971 }
 972 EXPORT_SYMBOL(dev_valid_name);
 973
 974 /**
 975  *      __dev_alloc_name - allocate a name for a device
 976  *      @net: network namespace to allocate the device name in
 977  *      @name: name format string
 978  *      @buf:  scratch buffer and result name string
 979  *
 980  *      Passed a format string - eg "lt%d" it will try and find a suitable
 981  *      id. It scans list of devices to build up a free map, then chooses
 982  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 983  *      while allocating the name and adding the device in order to avoid
 984  *      duplicates.
 985  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 986  *      Returns the number of the unit assigned or a negative errno code.
 987  */
 988
 989 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 990 {
 991         int i = 0;
 992         const char *p;
 993         const int max_netdevices = 8*PAGE_SIZE;
 994         unsigned long *inuse;
 995         struct net_device *d;
 996
 997         p = strnchr(name, IFNAMSIZ-1, '%');
 998         if (p) {
 999                 /*
1000                  * Verify the string as this thing may have come from
1001                  * the user.  There must be either one "%d" and no other "%"
1002                  * characters.
1003                  */
1004                 if (p[1] != 'd' || strchr(p + 2, '%'))
1005                         return -EINVAL;
1006
1007                 /* Use one page as a bit array of possible slots */
1008                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1009                 if (!inuse)
1010                         return -ENOMEM;
1011
1012                 for_each_netdev(net, d) {
1013                         if (!sscanf(d->name, name, &i))
1014                                 continue;
1015                         if (i < 0 || i >= max_netdevices)
1016                                 continue;
1017
1018                         /*  avoid cases where sscanf is not exact inverse of printf */
1019                         snprintf(buf, IFNAMSIZ, name, i);
1020                         if (!strncmp(buf, d->name, IFNAMSIZ))
1021                                 set_bit(i, inuse);
1022                 }
1023
1024                 i = find_first_zero_bit(inuse, max_netdevices);
1025                 free_page((unsigned long) inuse);
1026         }
1027
1028         if (buf != name)
1029                 snprintf(buf, IFNAMSIZ, name, i);
1030         if (!__dev_get_by_name(net, buf))
1031                 return i;
1032
1033         /* It is possible to run out of possible slots
1034          * when the name is long and there isn't enough space left
1035          * for the digits, or if all bits are used.
1036          */
1037         return -ENFILE;
1038 }
1039
1040 /**
1041  *      dev_alloc_name - allocate a name for a device
1042  *      @dev: device
1043  *      @name: name format string
1044  *
1045  *      Passed a format string - eg "lt%d" it will try and find a suitable
1046  *      id. It scans list of devices to build up a free map, then chooses
1047  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1048  *      while allocating the name and adding the device in order to avoid
1049  *      duplicates.
1050  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1051  *      Returns the number of the unit assigned or a negative errno code.
1052  */
1053
1054 int dev_alloc_name(struct net_device *dev, const char *name)
1055 {
1056         char buf[IFNAMSIZ];
1057         struct net *net;
1058         int ret;
1059
1060         BUG_ON(!dev_net(dev));
1061         net = dev_net(dev);
1062         ret = __dev_alloc_name(net, name, buf);
1063         if (ret >= 0)
1064                 strlcpy(dev->name, buf, IFNAMSIZ);
1065         return ret;
1066 }
1067 EXPORT_SYMBOL(dev_alloc_name);
1068
1069 static int dev_alloc_name_ns(struct net *net,
1070                              struct net_device *dev,
1071                              const char *name)
1072 {
1073         char buf[IFNAMSIZ];
1074         int ret;
1075
1076         ret = __dev_alloc_name(net, name, buf);
1077         if (ret >= 0)
1078                 strlcpy(dev->name, buf, IFNAMSIZ);
1079         return ret;
1080 }
1081
1082 static int dev_get_valid_name(struct net *net,
1083                               struct net_device *dev,
1084                               const char *name)
1085 {
1086         BUG_ON(!net);
1087
1088         if (!dev_valid_name(name))
1089                 return -EINVAL;
1090
1091         if (strchr(name, '%'))
1092                 return dev_alloc_name_ns(net, dev, name);
1093         else if (__dev_get_by_name(net, name))
1094                 return -EEXIST;
1095         else if (dev->name != name)
1096                 strlcpy(dev->name, name, IFNAMSIZ);
1097
1098         return 0;
1099 }
1100
1101 /**
1102  *      dev_change_name - change name of a device
1103  *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes long
1105  *
1106  *      Change name of a device, can pass format strings "eth%d".
1107  *      for wildcarding.
1108  */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        /* a device that is up cannot be renamed */
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /*
         * Open the write side of devnet_rename_seq; netdev_get_name()
         * re-reads the name when it sees this sequence change.
         */
        write_seqcount_begin(&devnet_rename_seq);

        /* the requested name is already the current one: nothing to do */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* remember the current name so it can be restored on failure */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* validates newname and, on success, writes the result to dev->name */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        /* don't log when the old name was empty or still a format string */
        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Also re-entered via goto below when a NETDEV_CHANGENAME notifier
         * rejects the change; by then dev->name holds the old name again
         * and oldname holds a copy of newname.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                /* restore the saved name and assign type, then give up */
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /*
         * Re-hash by name: unlink the entry, wait for RCU readers with
         * synchronize_rcu(), then insert it into the bucket for the
         * name now stored in dev->name.
         */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First notifier failure: record it, swap the
                         * old/new names and assign types, and run the
                         * rename sequence again to undo the change.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* the undo pass was rejected as well: only log it */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1189
1190 /**
1191  *      dev_set_alias - change ifalias of a device
1192  *      @dev: device
1193  *      @alias: name up to IFALIASZ
1194  *      @len: limit of bytes to copy from info
1195  *
1196  *      Set ifalias for a device,
1197  */
1198 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1199 {
1200         char *new_ifalias;
1201
1202         ASSERT_RTNL();
1203
1204         if (len >= IFALIASZ)
1205                 return -EINVAL;
1206
1207         if (!len) {
1208                 kfree(dev->ifalias);
1209                 dev->ifalias = NULL;
1210                 return 0;
1211         }
1212
1213         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1214         if (!new_ifalias)
1215                 return -ENOMEM;
1216         dev->ifalias = new_ifalias;
1217
1218         strlcpy(dev->ifalias, alias, len+1);
1219         return len;
1220 }
1221
1222
1223 /**
1224  *      netdev_features_change - device changes features
1225  *      @dev: device to cause notification
1226  *
1227  *      Called to indicate a device has changed features.
1228  */
void netdev_features_change(struct net_device *dev)
{
        /* broadcast NETDEV_FEAT_CHANGE for @dev; the chain's result is ignored */
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
1233 EXPORT_SYMBOL(netdev_features_change);
1234
1235 /**
1236  *      netdev_state_change - device changes state
1237  *      @dev: device to cause notification
1238  *
1239  *      Called to indicate a device has changed state. This function calls
1240  *      the notifier chains for netdev_chain and sends a NEWLINK message
1241  *      to the routing socket.
1242  */
1243 void netdev_state_change(struct net_device *dev)
1244 {
1245         if (dev->flags & IFF_UP) {
1246                 struct netdev_notifier_change_info change_info;
1247
1248                 change_info.flags_changed = 0;
1249                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1250                                               &change_info.info);
1251                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1252         }
1253 }
1254 EXPORT_SYMBOL(netdev_state_change);
1255
1256 /**
1257  *      netdev_notify_peers - notify network peers about existence of @dev
1258  *      @dev: network device
1259  *
1260  * Generate traffic such that interested network peers are aware of
1261  * @dev, such as by generating a gratuitous ARP. This may be used when
1262  * a device wants to inform the rest of the network about some sort of
1263  * reconfiguration such as a failover event or virtual machine
1264  * migration.
1265  */
void netdev_notify_peers(struct net_device *dev)
{
        /* call_netdevice_notifiers() asserts RTNL, so take it around the call */
        rtnl_lock();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        rtnl_unlock();
}
1272 EXPORT_SYMBOL(netdev_notify_peers);
1273
1274 static int __dev_open(struct net_device *dev)
1275 {
1276         const struct net_device_ops *ops = dev->netdev_ops;
1277         int ret;
1278
1279         ASSERT_RTNL();
1280
1281         if (!netif_device_present(dev))
1282                 return -ENODEV;
1283
1284         /* Block netpoll from trying to do any rx path servicing.
1285          * If we don't do this there is a chance ndo_poll_controller
1286          * or ndo_poll may be running while we open the device
1287          */
1288         netpoll_poll_disable(dev);
1289
1290         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1291         ret = notifier_to_errno(ret);
1292         if (ret)
1293                 return ret;
1294
1295         set_bit(__LINK_STATE_START, &dev->state);
1296
1297         if (ops->ndo_validate_addr)
1298                 ret = ops->ndo_validate_addr(dev);
1299
1300         if (!ret && ops->ndo_open)
1301                 ret = ops->ndo_open(dev);
1302
1303         netpoll_poll_enable(dev);
1304
1305         if (ret)
1306                 clear_bit(__LINK_STATE_START, &dev->state);
1307         else {
1308                 dev->flags |= IFF_UP;
1309                 dev_set_rx_mode(dev);
1310                 dev_activate(dev);
1311                 add_device_randomness(dev->dev_addr, dev->addr_len);
1312         }
1313
1314         return ret;
1315 }
1316
1317 /**
1318  *      dev_open        - prepare an interface for use.
1319  *      @dev:   device to open
1320  *
1321  *      Takes a device from down to up state. The device's private open
1322  *      function is invoked and then the multicast lists are loaded. Finally
1323  *      the device is moved into the up state and a %NETDEV_UP message is
1324  *      sent to the netdev notifier chain.
1325  *
1326  *      Calling this function on an active interface is a nop. On a failure
1327  *      a negative errno code is returned.
1328  */
1329 int dev_open(struct net_device *dev)
1330 {
1331         int ret;
1332
1333         if (dev->flags & IFF_UP)
1334                 return 0;
1335
1336         ret = __dev_open(dev);
1337         if (ret < 0)
1338                 return ret;
1339
1340         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1341         call_netdevice_notifiers(NETDEV_UP, dev);
1342
1343         return ret;
1344 }
1345 EXPORT_SYMBOL(dev_open);
1346
1347 static int __dev_close_many(struct list_head *head)
1348 {
1349         struct net_device *dev;
1350
1351         ASSERT_RTNL();
1352         might_sleep();
1353
1354         list_for_each_entry(dev, head, close_list) {
1355                 /* Temporarily disable netpoll until the interface is down */
1356                 netpoll_poll_disable(dev);
1357
1358                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1359
1360                 clear_bit(__LINK_STATE_START, &dev->state);
1361
1362                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1363                  * can be even on different cpu. So just clear netif_running().
1364                  *
1365                  * dev->stop() will invoke napi_disable() on all of it's
1366                  * napi_struct instances on this device.
1367                  */
1368                 smp_mb__after_atomic(); /* Commit netif_running(). */
1369         }
1370
1371         dev_deactivate_many(head);
1372
1373         list_for_each_entry(dev, head, close_list) {
1374                 const struct net_device_ops *ops = dev->netdev_ops;
1375
1376                 /*
1377                  *      Call the device specific close. This cannot fail.
1378                  *      Only if device is UP
1379                  *
1380                  *      We allow it to be called even after a DETACH hot-plug
1381                  *      event.
1382                  */
1383                 if (ops->ndo_stop)
1384                         ops->ndo_stop(dev);
1385
1386                 dev->flags &= ~IFF_UP;
1387                 netpoll_poll_enable(dev);
1388         }
1389
1390         return 0;
1391 }
1392
1393 static int __dev_close(struct net_device *dev)
1394 {
1395         int retval;
1396         LIST_HEAD(single);
1397
1398         list_add(&dev->close_list, &single);
1399         retval = __dev_close_many(&single);
1400         list_del(&single);
1401
1402         return retval;
1403 }
1404
1405 int dev_close_many(struct list_head *head, bool unlink)
1406 {
1407         struct net_device *dev, *tmp;
1408
1409         /* Remove the devices that don't need to be closed */
1410         list_for_each_entry_safe(dev, tmp, head, close_list)
1411                 if (!(dev->flags & IFF_UP))
1412                         list_del_init(&dev->close_list);
1413
1414         __dev_close_many(head);
1415
1416         list_for_each_entry_safe(dev, tmp, head, close_list) {
1417                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1418                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1419                 if (unlink)
1420                         list_del_init(&dev->close_list);
1421         }
1422
1423         return 0;
1424 }
1425 EXPORT_SYMBOL(dev_close_many);
1426
1427 /**
1428  *      dev_close - shutdown an interface.
1429  *      @dev: device to shutdown
1430  *
1431  *      This function moves an active device into down state. A
1432  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1433  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1434  *      chain.
1435  */
1436 int dev_close(struct net_device *dev)
1437 {
1438         if (dev->flags & IFF_UP) {
1439                 LIST_HEAD(single);
1440
1441                 list_add(&dev->close_list, &single);
1442                 dev_close_many(&single, true);
1443                 list_del(&single);
1444         }
1445         return 0;
1446 }
1447 EXPORT_SYMBOL(dev_close);
1448
1449
1450 /**
1451  *      dev_disable_lro - disable Large Receive Offload on a device
1452  *      @dev: device
1453  *
1454  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1455  *      called under RTNL.  This is needed if received packets may be
1456  *      forwarded to another interface.
1457  */
1458 void dev_disable_lro(struct net_device *dev)
1459 {
1460         struct net_device *lower_dev;
1461         struct list_head *iter;
1462
1463         dev->wanted_features &= ~NETIF_F_LRO;
1464         netdev_update_features(dev);
1465
1466         if (unlikely(dev->features & NETIF_F_LRO))
1467                 netdev_WARN(dev, "failed to disable LRO!\n");
1468
1469         netdev_for_each_lower_dev(dev, lower_dev, iter)
1470                 dev_disable_lro(lower_dev);
1471 }
1472 EXPORT_SYMBOL(dev_disable_lro);
1473
1474 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1475                                    struct net_device *dev)
1476 {
1477         struct netdev_notifier_info info;
1478
1479         netdev_notifier_info_init(&info, dev);
1480         return nb->notifier_call(nb, val, &info);
1481 }
1482
/*
 * Starts at 1.  While it is non-zero, register_netdevice_notifier() adds
 * the notifier to the chain but does not replay events for existing devices.
 */
static int dev_boot_phase = 1;
1484
1485 /**
1486  *      register_netdevice_notifier - register a network notifier block
1487  *      @nb: notifier
1488  *
1489  *      Register a notifier to be called when network device events occur.
1490  *      The notifier passed is linked into the kernel structures and must
1491  *      not be reused until it has been unregistered. A negative errno code
1492  *      is returned on a failure.
1493  *
1494  *      When registered all registration and up events are replayed
1495  *      to the new notifier to allow device to have a race free
1496  *      view of the network device list.
1497  */
1498
int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* no replay while dev_boot_phase is set; err is 0 here */
        if (dev_boot_phase)
                goto unlock;
        /* replay REGISTER (and UP for devices that are up) to the new notifier */
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        /* the result of the replayed NETDEV_UP is not checked */
                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /*
         * Undo the replay: walk the devices in the same order and send the
         * teardown events to each one until the device whose REGISTER
         * failed is reached; that device itself gets no teardown events.
         */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        /* take the notifier back off the chain; err still holds the failure */
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
1550 EXPORT_SYMBOL(register_netdevice_notifier);
1551
1552 /**
1553  *      unregister_netdevice_notifier - unregister a network notifier block
1554  *      @nb: notifier
1555  *
1556  *      Unregister a notifier previously registered by
 *      register_netdevice_notifier(). The notifier is unlinked from the
1558  *      kernel structures and may then be reused. A negative errno code
1559  *      is returned on a failure.
1560  *
1561  *      After unregistering unregister and down device events are synthesized
1562  *      for all devices on the device list to the removed notifier to remove
1563  *      the need for special case cleanup code.
1564  */
1565
1566 int unregister_netdevice_notifier(struct notifier_block *nb)
1567 {
1568         struct net_device *dev;
1569         struct net *net;
1570         int err;
1571
1572         rtnl_lock();
1573         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1574         if (err)
1575                 goto unlock;
1576
1577         for_each_net(net) {
1578                 for_each_netdev(net, dev) {
1579                         if (dev->flags & IFF_UP) {
1580                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1581                                                         dev);
1582                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1583                         }
1584                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1585                 }
1586         }
1587 unlock:
1588         rtnl_unlock();
1589         return err;
1590 }
1591 EXPORT_SYMBOL(unregister_netdevice_notifier);
1592
1593 /**
1594  *      call_netdevice_notifiers_info - call all network notifier blocks
1595  *      @val: value passed unmodified to notifier function
1596  *      @dev: net_device pointer passed unmodified to notifier function
1597  *      @info: notifier information data
1598  *
1599  *      Call all network notifier blocks.  Parameters and return value
1600  *      are as for raw_notifier_call_chain().
1601  */
1602
1603 static int call_netdevice_notifiers_info(unsigned long val,
1604                                          struct net_device *dev,
1605                                          struct netdev_notifier_info *info)
1606 {
1607         ASSERT_RTNL();
1608         netdev_notifier_info_init(info, dev);
1609         return raw_notifier_call_chain(&netdev_chain, val, info);
1610 }
1611
1612 /**
1613  *      call_netdevice_notifiers - call all network notifier blocks
1614  *      @val: value passed unmodified to notifier function
1615  *      @dev: net_device pointer passed unmodified to notifier function
1616  *
1617  *      Call all network notifier blocks.  Parameters and return value
1618  *      are as for raw_notifier_call_chain().
1619  */
1620
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        /* on-stack info block; initialised by call_netdevice_notifiers_info() */
        struct netdev_notifier_info info;

        return call_netdevice_notifiers_info(val, dev, &info);
}
1627 EXPORT_SYMBOL(call_netdevice_notifiers);
1628
/*
 * Static key tested by net_timestamp_set() and net_timestamp_check().
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif
1637
1638 void net_enable_timestamp(void)
1639 {
1640 #ifdef HAVE_JUMP_LABEL
1641         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1642
1643         if (deferred) {
1644                 while (--deferred)
1645                         static_key_slow_dec(&netstamp_needed);
1646                 return;
1647         }
1648 #endif
1649         static_key_slow_inc(&netstamp_needed);
1650 }
1651 EXPORT_SYMBOL(net_enable_timestamp);
1652
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        /*
         * In interrupt context only count the request; the counter is
         * consumed by the next net_enable_timestamp() call.
         */
        if (in_interrupt()) {
                atomic_inc(&netstamp_needed_deferred);
                return;
        }
#endif
        static_key_slow_dec(&netstamp_needed);
}
1663 EXPORT_SYMBOL(net_disable_timestamp);
1664
/*
 * Clear @skb's timestamp, then call __net_timestamp() on it only when
 * the netstamp_needed key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp.tv64 = 0;
        if (static_key_false(&netstamp_needed))
                __net_timestamp(skb);
}
1671
/*
 * net_timestamp_check - call __net_timestamp() on SKB if it has no stamp yet
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB:  buffer to stamp
 *
 * Does nothing unless the netstamp_needed key is enabled.  Wrapped in
 * do { } while (0) so the macro expands to a single statement and can be
 * used safely as the body of an if/else.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1677
1678 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1679 {
1680         unsigned int len;
1681
1682         if (!(dev->flags & IFF_UP))
1683                 return false;
1684
1685         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1686         if (skb->len <= len)
1687                 return true;
1688
1689         /* if TSO is enabled, we don't care about the length as the packet
1690          * could be forwarded without being segmented before
1691          */
1692         if (skb_is_gso(skb))
1693                 return true;
1694
1695         return false;
1696 }
1697 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1698
1699 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1700 {
1701         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1702                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1703                         atomic_long_inc(&dev->rx_dropped);
1704                         kfree_skb(skb);
1705                         return NET_RX_DROP;
1706                 }
1707         }
1708
1709         if (unlikely(!is_skb_forwardable(dev, skb))) {
1710                 atomic_long_inc(&dev->rx_dropped);
1711                 kfree_skb(skb);
1712                 return NET_RX_DROP;
1713         }
1714
1715         skb_scrub_packet(skb, true);
1716         skb->priority = 0;
1717         skb->protocol = eth_type_trans(skb, dev);
1718         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1719
1720         return 0;
1721 }
1722 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1723
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = __dev_forward_skb(dev, skb);

	/* a non-zero result means the skb was already dropped and freed */
	if (ret)
		return ret;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1747
1748 static inline int deliver_skb(struct sk_buff *skb,
1749                               struct packet_type *pt_prev,
1750                               struct net_device *orig_dev)
1751 {
1752         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1753                 return -ENOMEM;
1754         atomic_inc(&skb->users);
1755         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1756 }
1757
1758 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1759                                           struct packet_type **pt,
1760                                           struct net_device *orig_dev,
1761                                           __be16 type,
1762                                           struct list_head *ptype_list)
1763 {
1764         struct packet_type *ptype, *pt_prev = *pt;
1765
1766         list_for_each_entry_rcu(ptype, ptype_list, list) {
1767                 if (ptype->type != type)
1768                         continue;
1769                 if (pt_prev)
1770                         deliver_skb(skb, pt_prev, orig_dev);
1771                 pt_prev = ptype;
1772         }
1773         *pt = pt_prev;
1774 }
1775
1776 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1777 {
1778         if (!ptype->af_packet_priv || !skb->sk)
1779                 return false;
1780
1781         if (ptype->id_match)
1782                 return ptype->id_match(ptype, skb->sk);
1783         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1784                 return true;
1785
1786         return false;
1787 }
1788
1789 /*
1790  *      Support routine. Sends outgoing frames to any network
1791  *      taps currently in use.
1792  */
1793
1794 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1795 {
1796         struct packet_type *ptype;
1797         struct sk_buff *skb2 = NULL;
1798         struct packet_type *pt_prev = NULL;
1799         struct list_head *ptype_list = &ptype_all;
1800
1801         rcu_read_lock();
1802 again:
1803         list_for_each_entry_rcu(ptype, ptype_list, list) {
1804                 /* Never send packets back to the socket
1805                  * they originated from - MvS (miquels@drinkel.ow.org)
1806                  */
1807                 if (skb_loop_sk(ptype, skb))
1808                         continue;
1809
1810                 if (pt_prev) {
1811                         deliver_skb(skb2, pt_prev, skb->dev);
1812                         pt_prev = ptype;
1813                         continue;
1814                 }
1815
1816                 /* need to clone skb, done only once */
1817                 skb2 = skb_clone(skb, GFP_ATOMIC);
1818                 if (!skb2)
1819                         goto out_unlock;
1820
1821                 net_timestamp_set(skb2);
1822
1823                 /* skb->nh should be correctly
1824                  * set by sender, so that the second statement is
1825                  * just protection against buggy protocols.
1826                  */
1827                 skb_reset_mac_header(skb2);
1828
1829                 if (skb_network_header(skb2) < skb2->data ||
1830                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1831                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1832                                              ntohs(skb2->protocol),
1833                                              dev->name);
1834                         skb_reset_network_header(skb2);
1835                 }
1836
1837                 skb2->transport_header = skb2->network_header;
1838                 skb2->pkt_type = PACKET_OUTGOING;
1839                 pt_prev = ptype;
1840         }
1841
1842         if (ptype_list == &ptype_all) {
1843                 ptype_list = &dev->ptype_all;
1844                 goto again;
1845         }
1846 out_unlock:
1847         if (pt_prev)
1848                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1849         rcu_read_unlock();
1850 }
1851
1852 /**
1853  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1854  * @dev: Network device
1855  * @txq: number of queues available
1856  *
1857  * If real_num_tx_queues is changed the tc mappings may no longer be
1858  * valid. To resolve this verify the tc mapping remains valid and if
1859  * not NULL the mapping. With no priorities mapping to this
1860  * offset/count pair it will no longer be used. In the worst case TC0
1861  * is invalid nothing can be done so disable priority mappings. If is
1862  * expected that drivers will fix this mapping if they can before
1863  * calling netif_set_real_num_tx_queues.
1864  */
1865 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1866 {
1867         int i;
1868         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1869
1870         /* If TC0 is invalidated disable TC mapping */
1871         if (tc->offset + tc->count > txq) {
1872                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1873                 dev->num_tc = 0;
1874                 return;
1875         }
1876
1877         /* Invalidated prio to tc mappings set to TC0 */
1878         for (i = 1; i < TC_BITMASK + 1; i++) {
1879                 int q = netdev_get_prio_tc_map(dev, i);
1880
1881                 tc = &dev->tc_to_txq[q];
1882                 if (tc->offset + tc->count > txq) {
1883                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1884                                 i, q);
1885                         netdev_set_prio_tc_map(dev, i, 0);
1886                 }
1887         }
1888 }
1889
1890 #ifdef CONFIG_XPS
/* Held by the XPS functions below while they read and modify the maps. */
static DEFINE_MUTEX(xps_map_mutex);

/* Dereference an RCU-protected XPS pointer; lockdep checks that
 * xps_map_mutex is held.
 */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1894
1895 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1896                                         int cpu, u16 index)
1897 {
1898         struct xps_map *map = NULL;
1899         int pos;
1900
1901         if (dev_maps)
1902                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1903
1904         for (pos = 0; map && pos < map->len; pos++) {
1905                 if (map->queues[pos] == index) {
1906                         if (map->len > 1) {
1907                                 map->queues[pos] = map->queues[--map->len];
1908                         } else {
1909                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1910                                 kfree_rcu(map, rcu);
1911                                 map = NULL;
1912                         }
1913                         break;
1914                 }
1915         }
1916
1917         return map;
1918 }
1919
1920 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1921 {
1922         struct xps_dev_maps *dev_maps;
1923         int cpu, i;
1924         bool active = false;
1925
1926         mutex_lock(&xps_map_mutex);
1927         dev_maps = xmap_dereference(dev->xps_maps);
1928
1929         if (!dev_maps)
1930                 goto out_no_maps;
1931
1932         for_each_possible_cpu(cpu) {
1933                 for (i = index; i < dev->num_tx_queues; i++) {
1934                         if (!remove_xps_queue(dev_maps, cpu, i))
1935                                 break;
1936                 }
1937                 if (i == dev->num_tx_queues)
1938                         active = true;
1939         }
1940
1941         if (!active) {
1942                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1943                 kfree_rcu(dev_maps, rcu);
1944         }
1945
1946         for (i = index; i < dev->num_tx_queues; i++)
1947                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1948                                              NUMA_NO_NODE);
1949
1950 out_no_maps:
1951         mutex_unlock(&xps_map_mutex);
1952 }
1953
1954 static struct xps_map *expand_xps_map(struct xps_map *map,
1955                                       int cpu, u16 index)
1956 {
1957         struct xps_map *new_map;
1958         int alloc_len = XPS_MIN_MAP_ALLOC;
1959         int i, pos;
1960
1961         for (pos = 0; map && pos < map->len; pos++) {
1962                 if (map->queues[pos] != index)
1963                         continue;
1964                 return map;
1965         }
1966
1967         /* Need to add queue to this CPU's existing map */
1968         if (map) {
1969                 if (pos < map->alloc_len)
1970                         return map;
1971
1972                 alloc_len = map->alloc_len * 2;
1973         }
1974
1975         /* Need to allocate new map to store queue on this CPU's map */
1976         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1977                                cpu_to_node(cpu));
1978         if (!new_map)
1979                 return NULL;
1980
1981         for (i = 0; i < pos; i++)
1982                 new_map->queues[i] = map->queues[i];
1983         new_map->alloc_len = alloc_len;
1984         new_map->len = pos;
1985
1986         return new_map;
1987 }
1988
1989 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1990                         u16 index)
1991 {
1992         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1993         struct xps_map *map, *new_map;
1994         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1995         int cpu, numa_node_id = -2;
1996         bool active = false;
1997
1998         mutex_lock(&xps_map_mutex);
1999
2000         dev_maps = xmap_dereference(dev->xps_maps);
2001
2002         /* allocate memory for queue storage */
2003         for_each_online_cpu(cpu) {
2004                 if (!cpumask_test_cpu(cpu, mask))
2005                         continue;
2006
2007                 if (!new_dev_maps)
2008                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2009                 if (!new_dev_maps) {
2010                         mutex_unlock(&xps_map_mutex);
2011                         return -ENOMEM;
2012                 }
2013
2014                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2015                                  NULL;
2016
2017                 map = expand_xps_map(map, cpu, index);
2018                 if (!map)
2019                         goto error;
2020
2021                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2022         }
2023
2024         if (!new_dev_maps)
2025                 goto out_no_new_maps;
2026
2027         for_each_possible_cpu(cpu) {
2028                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2029                         /* add queue to CPU maps */
2030                         int pos = 0;
2031
2032                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2033                         while ((pos < map->len) && (map->queues[pos] != index))
2034                                 pos++;
2035
2036                         if (pos == map->len)
2037                                 map->queues[map->len++] = index;
2038 #ifdef CONFIG_NUMA
2039                         if (numa_node_id == -2)
2040                                 numa_node_id = cpu_to_node(cpu);
2041                         else if (numa_node_id != cpu_to_node(cpu))
2042                                 numa_node_id = -1;
2043 #endif
2044                 } else if (dev_maps) {
2045                         /* fill in the new device map from the old device map */
2046                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2047                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2048                 }
2049
2050         }
2051
2052         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2053
2054         /* Cleanup old maps */
2055         if (dev_maps) {
2056                 for_each_possible_cpu(cpu) {
2057                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2058                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2059                         if (map && map != new_map)
2060                                 kfree_rcu(map, rcu);
2061                 }
2062
2063                 kfree_rcu(dev_maps, rcu);
2064         }
2065
2066         dev_maps = new_dev_maps;
2067         active = true;
2068
2069 out_no_new_maps:
2070         /* update Tx queue numa node */
2071         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2072                                      (numa_node_id >= 0) ? numa_node_id :
2073                                      NUMA_NO_NODE);
2074
2075         if (!dev_maps)
2076                 goto out_no_maps;
2077
2078         /* removes queue from unused CPUs */
2079         for_each_possible_cpu(cpu) {
2080                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2081                         continue;
2082
2083                 if (remove_xps_queue(dev_maps, cpu, index))
2084                         active = true;
2085         }
2086
2087         /* free map if not active */
2088         if (!active) {
2089                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2090                 kfree_rcu(dev_maps, rcu);
2091         }
2092
2093 out_no_maps:
2094         mutex_unlock(&xps_map_mutex);
2095
2096         return 0;
2097 error:
2098         /* remove any maps that we added */
2099         for_each_possible_cpu(cpu) {
2100                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2101                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2102                                  NULL;
2103                 if (new_map && new_map != map)
2104                         kfree(new_map);
2105         }
2106
2107         mutex_unlock(&xps_map_mutex);
2108
2109         kfree(new_dev_maps);
2110         return -ENOMEM;
2111 }
2112 EXPORT_SYMBOL(netif_set_xps_queue);
2113
2114 #endif
2115 /*
2116  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2117  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2118  */
2119 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2120 {
2121         int rc;
2122
2123         if (txq < 1 || txq > dev->num_tx_queues)
2124                 return -EINVAL;
2125
2126         if (dev->reg_state == NETREG_REGISTERED ||
2127             dev->reg_state == NETREG_UNREGISTERING) {
2128                 ASSERT_RTNL();
2129
2130                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2131                                                   txq);
2132                 if (rc)
2133                         return rc;
2134
2135                 if (dev->num_tc)
2136                         netif_setup_tc(dev, txq);
2137
2138                 if (txq < dev->real_num_tx_queues) {
2139                         qdisc_reset_all_tx_gt(dev, txq);
2140 #ifdef CONFIG_XPS
2141                         netif_reset_xps_queues_gt(dev, txq);
2142 #endif
2143                 }
2144         }
2145
2146         dev->real_num_tx_queues = txq;
2147         return 0;
2148 }
2149 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2150
2151 #ifdef CONFIG_SYSFS
2152 /**
2153  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2154  *      @dev: Network device
2155  *      @rxq: Actual number of RX queues
2156  *
2157  *      This must be called either with the rtnl_lock held or before
2158  *      registration of the net device.  Returns 0 on success, or a
2159  *      negative error code.  If called before registration, it always
2160  *      succeeds.
2161  */
2162 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2163 {
2164         int rc;
2165
2166         if (rxq < 1 || rxq > dev->num_rx_queues)
2167                 return -EINVAL;
2168
2169         if (dev->reg_state == NETREG_REGISTERED) {
2170                 ASSERT_RTNL();
2171
2172                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2173                                                   rxq);
2174                 if (rc)
2175                         return rc;
2176         }
2177
2178         dev->real_num_rx_queues = rxq;
2179         return 0;
2180 }
2181 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2182 #endif
2183
2184 /**
2185  * netif_get_num_default_rss_queues - default number of RSS queues
2186  *
2187  * This routine should set an upper limit on the number of RSS queues
2188  * used by default by multiqueue devices.
2189  */
2190 int netif_get_num_default_rss_queues(void)
2191 {
2192         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2193 }
2194 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2195
2196 static inline void __netif_reschedule(struct Qdisc *q)
2197 {
2198         struct softnet_data *sd;
2199         unsigned long flags;
2200
2201         local_irq_save(flags);
2202         sd = this_cpu_ptr(&softnet_data);
2203         q->next_sched = NULL;
2204         *sd->output_queue_tailp = q;
2205         sd->output_queue_tailp = &q->next_sched;
2206         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2207         local_irq_restore(flags);
2208 }
2209
2210 void __netif_schedule(struct Qdisc *q)
2211 {
2212         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2213                 __netif_reschedule(q);
2214 }
2215 EXPORT_SYMBOL(__netif_schedule);
2216
2217 struct dev_kfree_skb_cb {
2218         enum skb_free_reason reason;
2219 };
2220
2221 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2222 {
2223         return (struct dev_kfree_skb_cb *)skb->cb;
2224 }
2225
2226 void netif_schedule_queue(struct netdev_queue *txq)
2227 {
2228         rcu_read_lock();
2229         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2230                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2231
2232                 __netif_schedule(q);
2233         }
2234         rcu_read_unlock();
2235 }
2236 EXPORT_SYMBOL(netif_schedule_queue);
2237
2238 /**
2239  *      netif_wake_subqueue - allow sending packets on subqueue
2240  *      @dev: network device
2241  *      @queue_index: sub queue index
2242  *
2243  * Resume individual transmit queue of a device with multiple transmit queues.
2244  */
2245 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2246 {
2247         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2248
2249         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2250                 struct Qdisc *q;
2251
2252                 rcu_read_lock();
2253                 q = rcu_dereference(txq->qdisc);
2254                 __netif_schedule(q);
2255                 rcu_read_unlock();
2256         }
2257 }
2258 EXPORT_SYMBOL(netif_wake_subqueue);
2259
2260 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2261 {
2262         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2263                 struct Qdisc *q;
2264
2265                 rcu_read_lock();
2266                 q = rcu_dereference(dev_queue->qdisc);
2267                 __netif_schedule(q);
2268                 rcu_read_unlock();
2269         }
2270 }
2271 EXPORT_SYMBOL(netif_tx_wake_queue);
2272
2273 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2274 {
2275         unsigned long flags;
2276
2277         if (likely(atomic_read(&skb->users) == 1)) {
2278                 smp_rmb();
2279                 atomic_set(&skb->users, 0);
2280         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2281                 return;
2282         }
2283         get_kfree_skb_cb(skb)->reason = reason;
2284         local_irq_save(flags);
2285         skb->next = __this_cpu_read(softnet_data.completion_queue);
2286         __this_cpu_write(softnet_data.completion_queue, skb);
2287         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2288         local_irq_restore(flags);
2289 }
2290 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2291
2292 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2293 {
2294         if (in_irq() || irqs_disabled())
2295                 __dev_kfree_skb_irq(skb, reason);
2296         else
2297                 dev_kfree_skb(skb);
2298 }
2299 EXPORT_SYMBOL(__dev_kfree_skb_any);
2300
2301
2302 /**
2303  * netif_device_detach - mark device as removed
2304  * @dev: network device
2305  *
2306  * Mark device as removed from system and therefore no longer available.
2307  */
2308 void netif_device_detach(struct net_device *dev)
2309 {
2310         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2311             netif_running(dev)) {
2312                 netif_tx_stop_all_queues(dev);
2313         }
2314 }
2315 EXPORT_SYMBOL(netif_device_detach);
2316
2317 /**
2318  * netif_device_attach - mark device as attached
2319  * @dev: network device
2320  *
2321  * Mark device as attached from system and restart if needed.
2322  */
2323 void netif_device_attach(struct net_device *dev)
2324 {
2325         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2326             netif_running(dev)) {
2327                 netif_tx_wake_all_queues(dev);
2328                 __netdev_watchdog_up(dev);
2329         }
2330 }
2331 EXPORT_SYMBOL(netif_device_attach);
2332
2333 static void skb_warn_bad_offload(const struct sk_buff *skb)
2334 {
2335         static const netdev_features_t null_features = 0;
2336         struct net_device *dev = skb->dev;
2337         const char *driver = "";
2338
2339         if (!net_ratelimit())
2340                 return;
2341
2342         if (dev && dev->dev.parent)
2343                 driver = dev_driver_string(dev->dev.parent);
2344
2345         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2346              "gso_type=%d ip_summed=%d\n",
2347              driver, dev ? &dev->features : &null_features,
2348              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2349              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2350              skb_shinfo(skb)->gso_type, skb->ip_summed);
2351 }
2352
2353 /*
2354  * Invalidate hardware checksum when packet is to be mangled, and
2355  * complete checksum manually on outgoing path.
2356  */
2357 int skb_checksum_help(struct sk_buff *skb)
2358 {
2359         __wsum csum;
2360         int ret = 0, offset;
2361
2362         if (skb->ip_summed == CHECKSUM_COMPLETE)
2363                 goto out_set_summed;
2364
2365         if (unlikely(skb_shinfo(skb)->gso_size)) {
2366                 skb_warn_bad_offload(skb);
2367                 return -EINVAL;
2368         }
2369
2370         /* Before computing a checksum, we should make sure no frag could
2371          * be modified by an external entity : checksum could be wrong.
2372          */
2373         if (skb_has_shared_frag(skb)) {
2374                 ret = __skb_linearize(skb);
2375                 if (ret)
2376                         goto out;
2377         }
2378
2379         offset = skb_checksum_start_offset(skb);
2380         BUG_ON(offset >= skb_headlen(skb));
2381         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2382
2383         offset += skb->csum_offset;
2384         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2385
2386         if (skb_cloned(skb) &&
2387             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2388                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2389                 if (ret)
2390                         goto out;
2391         }
2392
2393         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2394 out_set_summed:
2395         skb->ip_summed = CHECKSUM_NONE;
2396 out:
2397         return ret;
2398 }
2399 EXPORT_SYMBOL(skb_checksum_help);
2400
2401 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2402 {
2403         __be16 type = skb->protocol;
2404
2405         /* Tunnel gso handlers can set protocol to ethernet. */
2406         if (type == htons(ETH_P_TEB)) {
2407                 struct ethhdr *eth;
2408
2409                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2410                         return 0;
2411
2412                 eth = (struct ethhdr *)skb_mac_header(skb);
2413                 type = eth->h_proto;
2414         }
2415
2416         return __vlan_get_protocol(skb, type, depth);
2417 }
2418
2419 /**
2420  *      skb_mac_gso_segment - mac layer segmentation handler.
2421  *      @skb: buffer to segment
2422  *      @features: features for the output path (see dev->features)
2423  */
2424 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2425                                     netdev_features_t features)
2426 {
2427         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2428         struct packet_offload *ptype;
2429         int vlan_depth = skb->mac_len;
2430         __be16 type = skb_network_protocol(skb, &vlan_depth);
2431
2432         if (unlikely(!type))
2433                 return ERR_PTR(-EINVAL);
2434
2435         __skb_pull(skb, vlan_depth);
2436
2437         rcu_read_lock();
2438         list_for_each_entry_rcu(ptype, &offload_base, list) {
2439                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2440                         segs = ptype->callbacks.gso_segment(skb, features);
2441                         break;
2442                 }
2443         }
2444         rcu_read_unlock();
2445
2446         __skb_push(skb, skb->data - skb_mac_header(skb));
2447
2448         return segs;
2449 }
2450 EXPORT_SYMBOL(skb_mac_gso_segment);
2451
2452
2453 /* openvswitch calls this on rx path, so we need a different check.
2454  */
2455 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2456 {
2457         if (tx_path)
2458                 return skb->ip_summed != CHECKSUM_PARTIAL;
2459         else
2460                 return skb->ip_summed == CHECKSUM_NONE;
2461 }
2462
2463 /**
2464  *      __skb_gso_segment - Perform segmentation on skb.
2465  *      @skb: buffer to segment
2466  *      @features: features for the output path (see dev->features)
2467  *      @tx_path: whether it is called in TX path
2468  *
2469  *      This function segments the given skb and returns a list of segments.
2470  *
2471  *      It may return NULL if the skb requires no segmentation.  This is
2472  *      only possible when GSO is used for verifying header integrity.
2473  */
2474 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2475                                   netdev_features_t features, bool tx_path)
2476 {
2477         if (unlikely(skb_needs_check(skb, tx_path))) {
2478                 int err;
2479
2480                 skb_warn_bad_offload(skb);
2481
2482                 err = skb_cow_head(skb, 0);
2483                 if (err < 0)
2484                         return ERR_PTR(err);
2485         }
2486
2487         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2488         SKB_GSO_CB(skb)->encap_level = 0;
2489
2490         skb_reset_mac_header(skb);
2491         skb_reset_mac_len(skb);
2492
2493         return skb_mac_gso_segment(skb, features);
2494 }
2495 EXPORT_SYMBOL(__skb_gso_segment);
2496
2497 /* Take action when hardware reception checksum errors are detected. */
2498 #ifdef CONFIG_BUG
2499 void netdev_rx_csum_fault(struct net_device *dev)
2500 {
2501         if (net_ratelimit()) {
2502                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2503                 dump_stack();
2504         }
2505 }
2506 EXPORT_SYMBOL(netdev_rx_csum_fault);
2507 #endif
2508
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb lies in high memory that @dev
 * cannot address, 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	/* device without HIGHDMA: any highmem frag is illegal */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	/* every frag page must end within the parent's DMA mask */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2541
2542 /* If MPLS offload request, verify we are testing hardware MPLS features
2543  * instead of standard features for the netdev.
2544  */
2545 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2546 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2547                                            netdev_features_t features,
2548                                            __be16 type)
2549 {
2550         if (eth_p_mpls(type))
2551                 features &= skb->dev->mpls_features;
2552
2553         return features;
2554 }
2555 #else
2556 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2557                                            netdev_features_t features,
2558                                            __be16 type)
2559 {
2560         return features;
2561 }
2562 #endif
2563
2564 static netdev_features_t harmonize_features(struct sk_buff *skb,
2565         netdev_features_t features)
2566 {
2567         int tmp;
2568         __be16 type;
2569
2570         type = skb_network_protocol(skb, &tmp);
2571         features = net_mpls_features(skb, features, type);
2572
2573         if (skb->ip_summed != CHECKSUM_NONE &&
2574             !can_checksum_protocol(features, type)) {
2575                 features &= ~NETIF_F_ALL_CSUM;
2576         } else if (illegal_highdma(skb->dev, skb)) {
2577                 features &= ~NETIF_F_SG;
2578         }
2579
2580         return features;
2581 }
2582
/* passthru_features_check - features-check callback that restricts nothing
 * @skb: packet being checked (unused)
 * @dev: transmitting device (unused)
 * @features: candidate feature set
 *
 * Returns @features unchanged.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features)
{
        return features;
}
EXPORT_SYMBOL(passthru_features_check);
2590
/* Default features check, used by netif_skb_features() when the driver
 * supplies no ndo_features_check: defer entirely to vlan_features_check().
 * @dev is unused.
 */
static netdev_features_t dflt_features_check(const struct sk_buff *skb,
                                             struct net_device *dev,
                                             netdev_features_t features)
{
        return vlan_features_check(skb, features);
}
2597
2598 netdev_features_t netif_skb_features(struct sk_buff *skb)
2599 {
2600         struct net_device *dev = skb->dev;
2601         netdev_features_t features = dev->features;
2602         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2603
2604         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2605                 features &= ~NETIF_F_GSO_MASK;
2606
2607         /* If encapsulation offload request, verify we are testing
2608          * hardware encapsulation features instead of standard
2609          * features for the netdev
2610          */
2611         if (skb->encapsulation)
2612                 features &= dev->hw_enc_features;
2613
2614         if (skb_vlan_tagged(skb))
2615                 features = netdev_intersect_features(features,
2616                                                      dev->vlan_features |
2617                                                      NETIF_F_HW_VLAN_CTAG_TX |
2618                                                      NETIF_F_HW_VLAN_STAG_TX);
2619
2620         if (dev->netdev_ops->ndo_features_check)
2621                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2622                                                                 features);
2623         else
2624                 features &= dflt_features_check(skb, dev, features);
2625
2626         return harmonize_features(skb, features);
2627 }
2628 EXPORT_SYMBOL(netif_skb_features);
2629
2630 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2631                     struct netdev_queue *txq, bool more)
2632 {
2633         unsigned int len;
2634         int rc;
2635
2636         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2637                 dev_queue_xmit_nit(skb, dev);
2638
2639         len = skb->len;
2640         trace_net_dev_start_xmit(skb, dev);
2641         rc = netdev_start_xmit(skb, dev, txq, more);
2642         trace_net_dev_xmit(skb, rc, dev, len);
2643
2644         return rc;
2645 }
2646
2647 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2648                                     struct netdev_queue *txq, int *ret)
2649 {
2650         struct sk_buff *skb = first;
2651         int rc = NETDEV_TX_OK;
2652
2653         while (skb) {
2654                 struct sk_buff *next = skb->next;
2655
2656                 skb->next = NULL;
2657                 rc = xmit_one(skb, dev, txq, next != NULL);
2658                 if (unlikely(!dev_xmit_complete(rc))) {
2659                         skb->next = next;
2660                         goto out;
2661                 }
2662
2663                 skb = next;
2664                 if (netif_xmit_stopped(txq) && skb) {
2665                         rc = NETDEV_TX_BUSY;
2666                         break;
2667                 }
2668         }
2669
2670 out:
2671         *ret = rc;
2672         return skb;
2673 }
2674
2675 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2676                                           netdev_features_t features)
2677 {
2678         if (skb_vlan_tag_present(skb) &&
2679             !vlan_hw_offload_capable(features, skb->vlan_proto))
2680                 skb = __vlan_hwaccel_push_inside(skb);
2681         return skb;
2682 }
2683
2684 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2685 {
2686         netdev_features_t features;
2687
2688         if (skb->next)
2689                 return skb;
2690
2691         features = netif_skb_features(skb);
2692         skb = validate_xmit_vlan(skb, features);
2693         if (unlikely(!skb))
2694                 goto out_null;
2695
2696         if (netif_needs_gso(dev, skb, features)) {
2697                 struct sk_buff *segs;
2698
2699                 segs = skb_gso_segment(skb, features);
2700                 if (IS_ERR(segs)) {
2701                         goto out_kfree_skb;
2702                 } else if (segs) {
2703                         consume_skb(skb);
2704                         skb = segs;
2705                 }
2706         } else {
2707                 if (skb_needs_linearize(skb, features) &&
2708                     __skb_linearize(skb))
2709                         goto out_kfree_skb;
2710
2711                 /* If packet is not checksummed and device does not
2712                  * support checksumming for this protocol, complete
2713                  * checksumming here.
2714                  */
2715                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2716                         if (skb->encapsulation)
2717                                 skb_set_inner_transport_header(skb,
2718                                                                skb_checksum_start_offset(skb));
2719                         else
2720                                 skb_set_transport_header(skb,
2721                                                          skb_checksum_start_offset(skb));
2722                         if (!(features & NETIF_F_ALL_CSUM) &&
2723                             skb_checksum_help(skb))
2724                                 goto out_kfree_skb;
2725                 }
2726         }
2727
2728         return skb;
2729
2730 out_kfree_skb:
2731         kfree_skb(skb);
2732 out_null:
2733         return NULL;
2734 }
2735
2736 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2737 {
2738         struct sk_buff *next, *head = NULL, *tail;
2739
2740         for (; skb != NULL; skb = next) {
2741                 next = skb->next;
2742                 skb->next = NULL;
2743
2744                 /* in case skb wont be segmented, point to itself */
2745                 skb->prev = skb;
2746
2747                 skb = validate_xmit_skb(skb, dev);
2748                 if (!skb)
2749                         continue;
2750
2751                 if (!head)
2752                         head = skb;
2753                 else
2754                         tail->next = skb;
2755                 /* If skb was segmented, skb->prev points to
2756                  * the last segment. If not, it still contains skb.
2757                  */
2758                 tail = skb->prev;
2759         }
2760         return head;
2761 }
2762
2763 static void qdisc_pkt_len_init(struct sk_buff *skb)
2764 {
2765         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2766
2767         qdisc_skb_cb(skb)->pkt_len = skb->len;
2768
2769         /* To get more precise estimation of bytes sent on wire,
2770          * we add to pkt_len the headers size of all segments
2771          */
2772         if (shinfo->gso_size)  {
2773                 unsigned int hdr_len;
2774                 u16 gso_segs = shinfo->gso_segs;
2775
2776                 /* mac layer + network layer */
2777                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2778
2779                 /* + transport layer */
2780                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2781                         hdr_len += tcp_hdrlen(skb);
2782                 else
2783                         hdr_len += sizeof(struct udphdr);
2784
2785                 if (shinfo->gso_type & SKB_GSO_DODGY)
2786                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2787                                                 shinfo->gso_size);
2788
2789                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2790         }
2791 }
2792
/*
 * __dev_xmit_skb - pass @skb to qdisc @q, transmitting directly if possible
 * @skb: packet to send
 * @q: queueing discipline attached to @txq
 * @dev: transmitting device
 * @txq: selected transmit queue
 *
 * Under the qdisc root lock, one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc may be bypassed, is empty and not running: the skb is sent
 *    directly via sch_direct_xmit(), NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else is
 *    running it; the enqueue result masked with NET_XMIT_MASK is returned.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        bool contended;
        int rc;

        qdisc_pkt_len_init(skb);
        qdisc_calculate_pkt_len(skb, q);
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits __QDISC___STATE_RUNNING owner to get the lock more
         * often and dequeue packets faster.
         */
        contended = qdisc_is_running(q);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                kfree_skb(skb);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                        /* Release busylock before the run so waiters can
                         * move on to the root lock; clearing 'contended'
                         * prevents a second unlock at the end.
                         */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                } else
                        qdisc_run_end(q);

                rc = NET_XMIT_SUCCESS;
        } else {
                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }
        }
        spin_unlock(root_lock);
        /* Still holding busylock only if no branch above released it. */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}
2852
2853 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2854 static void skb_update_prio(struct sk_buff *skb)
2855 {
2856         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2857
2858         if (!skb->priority && skb->sk && map) {
2859                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2860
2861                 if (prioidx < map->priomap_len)
2862                         skb->priority = map->priomap[prioidx];
2863         }
2864 }
2865 #else
2866 #define skb_update_prio(skb)
2867 #endif
2868
/* Per-CPU count of dev_hard_start_xmit() calls currently in progress from
 * the queueless path of __dev_queue_xmit().  When it exceeds
 * RECURSION_LIMIT the packet is dropped instead of being transmitted.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2871
2872 /**
2873  *      dev_loopback_xmit - loop back @skb
2874  *      @skb: buffer to transmit
2875  */
2876 int dev_loopback_xmit(struct sk_buff *skb)
2877 {
2878         skb_reset_mac_header(skb);
2879         __skb_pull(skb, skb_network_offset(skb));
2880         skb->pkt_type = PACKET_LOOPBACK;
2881         skb->ip_summed = CHECKSUM_UNNECESSARY;
2882         WARN_ON(!skb_dst(skb));
2883         skb_dst_force(skb);
2884         netif_rx_ni(skb);
2885         return 0;
2886 }
2887 EXPORT_SYMBOL(dev_loopback_xmit);
2888
/**
 *      __dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *      @accel_priv: private data used for L2 forwarding offload
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 *      Note that this function can also return the result of the queue
 *      discipline, including NET_XMIT_DROP, which is a positive value.
 *      Errors can therefore be positive as well as negative.
 *
 *      Regardless of the return value, the skb is consumed, so it is
 *      difficult to retry a send through this function.  (A caller can take
 *      an extra reference before sending to keep the buffer for a retry,
 *      if it is careful.)
 *
 *      When calling this function, interrupts MUST be enabled.  This is
 *      because the BH enable code must have IRQs enabled so that it will
 *      not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        /* Returned unchanged if validate_xmit_skb() consumes the skb. */
        int rc = -ENOMEM;

        skb_reset_mac_header(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        /* If device/qdisc don't need skb->dst, release it right now while
         * it's hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue method takes the packet from here. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. This is the common case for software
         * devices: loopback, all sorts of tunnels...
         *
         * It is unlikely that netif_tx_lock protection is really necessary
         * here (e.g. loopback and IP tunnels are clean, ignoring statistics
         * counters).  However, such devices may rely on the protection we
         * provide here, so check the lock owner and take the lock anyway;
         * this is not prone to deadlocks.  Alternatively the noqueue qdisc
         * could be removed, which would be even simpler.
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* If this CPU already owns the xmit lock we are being
                 * re-entered for the same queue; see the else branch.
                 */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        skb = validate_xmit_skb(skb, dev);
                        if (!skb)
                                goto drop;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                __this_cpu_inc(xmit_recursion);
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /* Device down, queue stopped, transmit incomplete or recursion. */
        rc = -ENETDOWN;
drop:
        rcu_read_unlock_bh();

        atomic_long_inc(&dev->tx_dropped);
        kfree_skb_list(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
3013
/* dev_queue_xmit - queue @skb for transmission on skb->dev
 *
 * Calls __dev_queue_xmit() with no L2 forwarding offload private data and
 * returns its result.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
3019
/* dev_queue_xmit_accel - queue @skb for transmission, passing @accel_priv
 * (private data used for L2 forwarding offload) through to
 * __dev_queue_xmit().  Returns that function's result.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
        return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3025
3026
3027 /*=======================================================================
3028                         Receiver routines
3029   =======================================================================*/
3030
/* Limit on the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * drops packets once the queue is longer than this, and skb_flow_limit()
 * only starts acting once the queue is at least half this long.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check() in netif_rx_internal(), i.e. before the
 * packet is put on a backlog queue.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * per-softirq packet budget for NAPI polling — confirm against its user.
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
3037
/* Append @napi to @sd's poll list and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3045
3046 #ifdef CONFIG_RPS
3047
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask selecting the CPU-number bits of an rps_sock_flow_table entry; in
 * get_rps_cpu() the remaining bits are compared against the flow hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key tested by netif_rx_internal() to choose the RPS path. */
struct static_key rps_needed __read_mostly;
3055
/* set_rps_cpu - record @next_cpu as the CPU handling the flow of @skb
 * @dev: receiving device
 * @skb: packet belonging to the flow
 * @rflow: the flow's current entry in the per-queue flow table
 * @next_cpu: CPU to steer the flow to, or RPS_NO_CPU
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * also asked to steer the flow to the hardware queue mapped to @next_cpu;
 * on success the flow entry of that queue is used instead of @rflow.
 *
 * Returns the flow entry that was updated.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u32 flow_id;
                u16 rxq_index;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the queue mapped to next_cpu. */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = skb_get_hash(skb) & flow_table->mask;
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                /* Switch to the target queue's flow entry and store the
                 * filter id returned by the driver; drop the id from the
                 * old entry if it is the same one.
                 */
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                rflow->filter = rc;
                if (old_rflow->filter == rflow->filter)
                        old_rflow->filter = RPS_NO_FILTER;
        out:
#endif
                /* Snapshot of the target CPU's input queue head at the
                 * moment the flow is moved.
                 */
                rflow->last_qtail =
                        per_cpu(softnet_data, next_cpu).input_queue_head;
        }

        rflow->cpu = next_cpu;
        return rflow;
}
3100
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU, or -1 when no steering applies (RPS/RFS not
 * configured for the queue, invalid queue index, zero hash, or no online
 * target).  *rflowp is written only when the CPU comes from the flow
 * table; otherwise it is left as the caller set it.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                       struct rps_dev_flow **rflowp)
{
        const struct rps_sock_flow_table *sock_flow_table;
        struct netdev_rx_queue *rxqueue = dev->_rx;
        struct rps_dev_flow_table *flow_table;
        struct rps_map *map;
        int cpu = -1;
        u32 tcpu;
        u32 hash;

        /* Use the recorded rx queue if any, else the device's first queue. */
        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);

                if (unlikely(index >= dev->real_num_rx_queues)) {
                        WARN_ONCE(dev->real_num_rx_queues > 1,
                                  "%s received packet on queue %u, but number "
                                  "of RX queues is %u\n",
                                  dev->name, index, dev->real_num_rx_queues);
                        goto done;
                }
                rxqueue += index;
        }

        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */

        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        map = rcu_dereference(rxqueue->rps_map);
        if (!flow_table && !map)
                goto done;

        skb_reset_network_header(skb);
        hash = skb_get_hash(skb);
        if (!hash)
                goto done;

        sock_flow_table = rcu_dereference(rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
                struct rps_dev_flow *rflow;
                u32 next_cpu;
                u32 ident;

                /* First check into global flow table if there is a match */
                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
                /* The non-CPU bits of the entry must equal those of the
                 * hash, otherwise the entry belongs to another flow.
                 */
                if ((ident ^ hash) & ~rps_cpu_mask)
                        goto try_rps;

                next_cpu = ident & rps_cpu_mask;

                /* OK, now we know there is a match,
                 * we can look at the local (per receive queue) flow table
                 */
                rflow = &flow_table->flows[hash & flow_table->mask];
                tcpu = rflow->cpu;

                /*
                 * If the desired CPU (where last recvmsg was done) is
                 * different from current CPU (one in the rx-queue flow
                 * table entry), switch if one of the following holds:
                 *   - Current CPU is unset (equal to RPS_NO_CPU).
                 *   - Current CPU is offline.
                 *   - The current CPU's queue tail has advanced beyond the
                 *     last packet that was enqueued using this table entry.
                 *     This guarantees that all previous packets for the flow
                 *     have been dequeued, thus preserving in order delivery.
                 */
                if (unlikely(tcpu != next_cpu) &&
                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
                      rflow->last_qtail)) >= 0)) {
                        tcpu = next_cpu;
                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                }

                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
                        *rflowp = rflow;
                        cpu = tcpu;
                        goto done;
                }
        }

try_rps:

        /* Fall back to the queue's static RPS map, indexed by the hash. */
        if (map) {
                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
                if (cpu_online(tcpu)) {
                        cpu = tcpu;
                        goto done;
                }
        }

done:
        return cpu;
}
3200
3201 #ifdef CONFIG_RFS_ACCEL
3202
3203 /**
3204  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3205  * @dev: Device on which the filter was set
3206  * @rxq_index: RX queue index
3207  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3208  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3209  *
3210  * Drivers that implement ndo_rx_flow_steer() should periodically call
3211  * this function for each installed filter and remove the filters for
3212  * which it returns %true.
3213  */
3214 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3215                          u32 flow_id, u16 filter_id)
3216 {
3217         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3218         struct rps_dev_flow_table *flow_table;
3219         struct rps_dev_flow *rflow;
3220         bool expire = true;
3221         int cpu;
3222
3223         rcu_read_lock();
3224         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3225         if (flow_table && flow_id <= flow_table->mask) {
3226                 rflow = &flow_table->flows[flow_id];
3227                 cpu = ACCESS_ONCE(rflow->cpu);
3228                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3229                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3230                            rflow->last_qtail) <
3231                      (int)(10 * flow_table->mask)))
3232                         expire = false;
3233         }
3234         rcu_read_unlock();
3235         return expire;
3236 }
3237 EXPORT_SYMBOL(rps_may_expire_flow);
3238
3239 #endif /* CONFIG_RFS_ACCEL */
3240
/* Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI is to be scheduled; the
 * received_rps counter of that structure is bumped as well.
 */
static void rps_trigger_softirq(void *data)
{
        struct softnet_data *sd = data;

        ____napi_schedule(sd, &sd->backlog);
        sd->received_rps++;
}
3249
3250 #endif /* CONFIG_RPS */
3251
/*
 * Decide whether @sd belongs to another CPU.
 * If it does, chain it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  If it is this CPU's own structure (or RPS is not built
 * in), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

        if (sd == mysd)
                return 0;

        sd->rps_ipi_next = mysd->rps_ipi_list;
        mysd->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
3272
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): not referenced in this part of the file; presumably the
 * number of buckets used when a flow-limit table is created — confirm.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3276
3277 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3278 {
3279 #ifdef CONFIG_NET_FLOW_LIMIT
3280         struct sd_flow_limit *fl;
3281         struct softnet_data *sd;
3282         unsigned int old_flow, new_flow;
3283
3284         if (qlen < (netdev_max_backlog >> 1))
3285                 return false;
3286
3287         sd = this_cpu_ptr(&softnet_data);
3288
3289         rcu_read_lock();
3290         fl = rcu_dereference(sd->flow_limit);
3291         if (fl) {
3292                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3293                 old_flow = fl->history[fl->history_head];
3294                 fl->history[fl->history_head] = new_flow;
3295
3296                 fl->history_head++;
3297                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3298
3299                 if (likely(fl->buckets[old_flow]))
3300                         fl->buckets[old_flow]--;
3301
3302                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3303                         fl->count++;
3304                         rcu_read_unlock();
3305                         return true;
3306                 }
3307         }
3308         rcu_read_unlock();
3309 #endif
3310         return false;
3311 }
3312
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * @skb: packet to queue
 * @cpu: CPU whose backlog queue receives the packet
 * @qtail: passed to input_queue_tail_incr_save() when the packet is queued
 *
 * Returns NET_RX_SUCCESS if the packet was queued.  If the queue is longer
 * than netdev_max_backlog or skb_flow_limit() rejects the packet, the
 * packet is freed, the drop counters are bumped and NET_RX_DROP is
 * returned.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                              unsigned int *qtail)
{
        struct softnet_data *sd;
        unsigned long flags;
        unsigned int qlen;

        sd = &per_cpu(softnet_data, cpu);

        local_irq_save(flags);

        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
                /* Non-empty queue: just append.  The empty-queue case
                 * below jumps back here after scheduling the backlog.
                 */
                if (qlen) {
enqueue:
                        __skb_queue_tail(&sd->input_pkt_queue, skb);
                        input_queue_tail_incr_save(sd, qtail);
                        rps_unlock(sd);
                        local_irq_restore(flags);
                        return NET_RX_SUCCESS;
                }

                /* Schedule NAPI for backlog device
                 * We can use non atomic operation since we own the queue lock
                 */
                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                        /* Remote sd: queued for an IPI instead of being
                         * scheduled on this CPU.
                         */
                        if (!rps_ipi_queued(sd))
                                ____napi_schedule(sd, &sd->backlog);
                }
                goto enqueue;
        }

        sd->dropped++;
        rps_unlock(sd);

        local_irq_restore(flags);

        atomic_long_inc(&skb->dev->rx_dropped);
        kfree_skb(skb);
        return NET_RX_DROP;
}
3360 static int netif_rx_internal(struct sk_buff *skb)
3361 {
3362         int ret;
3363
3364         net_timestamp_check(netdev_tstamp_prequeue, skb);
3365
3366         trace_netif_rx(skb);
3367 #ifdef CONFIG_RPS
3368         if (static_key_false(&rps_needed)) {
3369                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3370                 int cpu;
3371
3372                 preempt_disable();
3373                 rcu_read_lock();
3374
3375                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3376                 if (cpu < 0)
3377                         cpu = smp_processor_id();
3378
3379                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3380
3381                 rcu_read_unlock();
3382                 preempt_enable();
3383         } else
3384 #endif
3385         {
3386                 unsigned int qtail;
3387                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3388                 put_cpu();
3389         }
3390         return ret;
3391 }
3392
3393 /**
3394  *      netif_rx        -       post buffer to the network code
3395  *      @skb: buffer to post
3396  *
3397  *      This function receives a packet from a device driver and queues it for
3398  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3399  *      may be dropped during processing for congestion control or by the
3400  *      protocol layers.
3401  *
3402  *      return values:
3403  *      NET_RX_SUCCESS  (no congestion)
3404  *      NET_RX_DROP     (packet was dropped)
3405  *
3406  */
3407
3408 int netif_rx(struct sk_buff *skb)
3409 {
3410         trace_netif_rx_entry(skb);
3411
3412         return netif_rx_internal(skb);
3413 }
3414 EXPORT_SYMBOL(netif_rx);
3415
3416 int netif_rx_ni(struct sk_buff *skb)
3417 {
3418         int err;
3419
3420         trace_netif_rx_ni_entry(skb);
3421
3422         preempt_disable();
3423         err = netif_rx_internal(skb);
3424         if (local_softirq_pending())
3425                 do_softirq();
3426         preempt_enable();
3427
3428         return err;
3429 }
3430 EXPORT_SYMBOL(netif_rx_ni);
3431
3432 static void net_tx_action(struct softirq_action *h)
3433 {
3434         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3435
3436         if (sd->completion_queue) {
3437                 struct sk_buff *clist;
3438
3439                 local_irq_disable();
3440                 clist = sd->completion_queue;
3441                 sd->completion_queue = NULL;
3442                 local_irq_enable();
3443
3444                 while (clist) {
3445                         struct sk_buff *skb = clist;
3446                         clist = clist->next;
3447
3448                         WARN_ON(atomic_read(&skb->users));
3449                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3450                                 trace_consume_skb(skb);
3451                         else
3452                                 trace_kfree_skb(skb, net_tx_action);
3453                         __kfree_skb(skb);
3454                 }
3455         }
3456
3457         if (sd->output_queue) {
3458                 struct Qdisc *head;
3459
3460                 local_irq_disable();
3461                 head = sd->output_queue;
3462                 sd->output_queue = NULL;
3463                 sd->output_queue_tailp = &sd->output_queue;
3464                 local_irq_enable();
3465
3466                 while (head) {
3467                         struct Qdisc *q = head;
3468                         spinlock_t *root_lock;
3469
3470                         head = head->next_sched;
3471
3472                         root_lock = qdisc_lock(q);
3473                         if (spin_trylock(root_lock)) {
3474                                 smp_mb__before_atomic();
3475                                 clear_bit(__QDISC_STATE_SCHED,
3476                                           &q->state);
3477                                 qdisc_run(q);
3478                                 spin_unlock(root_lock);
3479                         } else {
3480                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3481                                               &q->state)) {
3482                                         __netif_reschedule(q);
3483                                 } else {
3484                                         smp_mb__before_atomic();
3485                                         clear_bit(__QDISC_STATE_SCHED,
3486                                                   &q->state);
3487                                 }
3488                         }
3489                 }
3490         }
3491 }
3492
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* Hook pointer defined here for ATM LANE.  It only exists when both the
 * bridge and ATM LANE are configured (built-in or as modules).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3500
3501 #ifdef CONFIG_NET_CLS_ACT
3502 /* TODO: Maybe we should just force sch_ingress to be compiled in
3503  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3504  * a compare and 2 stores extra right now if we dont have it on
3505  * but have CONFIG_NET_CLS_ACT
3506  * NOTE: This doesn't stop any functionality; if you dont have
3507  * the ingress scheduler, you just can't add policies on ingress.
3508  *
3509  */
3510 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3511 {
3512         struct net_device *dev = skb->dev;
3513         u32 ttl = G_TC_RTTL(skb->tc_verd);
3514         int result = TC_ACT_OK;
3515         struct Qdisc *q;
3516
3517         if (unlikely(MAX_RED_LOOP < ttl++)) {
3518                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3519                                      skb->skb_iif, dev->ifindex);
3520                 return TC_ACT_SHOT;
3521         }
3522
3523         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3524         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3525
3526         q = rcu_dereference(rxq->qdisc);
3527         if (q != &noop_qdisc) {
3528                 spin_lock(qdisc_lock(q));
3529                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3530                         result = qdisc_enqueue_root(skb, q);
3531                 spin_unlock(qdisc_lock(q));
3532         }
3533
3534         return result;
3535 }
3536
3537 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3538                                          struct packet_type **pt_prev,
3539                                          int *ret, struct net_device *orig_dev)
3540 {
3541         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3542
3543         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3544                 goto out;
3545
3546         if (*pt_prev) {
3547                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3548                 *pt_prev = NULL;
3549         }
3550
3551         switch (ing_filter(skb, rxq)) {
3552         case TC_ACT_SHOT:
3553         case TC_ACT_STOLEN:
3554                 kfree_skb(skb);
3555                 return NULL;
3556         }
3557
3558 out:
3559         skb->tc_verd = 0;
3560         return skb;
3561 }
3562 #endif
3563
3564 /**
3565  *      netdev_rx_handler_register - register receive handler
3566  *      @dev: device to register a handler for
3567  *      @rx_handler: receive handler to register
3568  *      @rx_handler_data: data pointer that is used by rx handler
3569  *
3570  *      Register a receive handler for a device. This handler will then be
3571  *      called from __netif_receive_skb. A negative errno code is returned
3572  *      on a failure.
3573  *
3574  *      The caller must hold the rtnl_mutex.
3575  *
3576  *      For a general description of rx_handler, see enum rx_handler_result.
3577  */
3578 int netdev_rx_handler_register(struct net_device *dev,
3579                                rx_handler_func_t *rx_handler,
3580                                void *rx_handler_data)
3581 {
3582         ASSERT_RTNL();
3583
3584         if (dev->rx_handler)
3585                 return -EBUSY;
3586
3587         /* Note: rx_handler_data must be set before rx_handler */
3588         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3589         rcu_assign_pointer(dev->rx_handler, rx_handler);
3590
3591         return 0;
3592 }
3593 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3594
3595 /**
3596  *      netdev_rx_handler_unregister - unregister receive handler
3597  *      @dev: device to unregister a handler from
3598  *
3599  *      Unregister a receive handler from a device.
3600  *
3601  *      The caller must hold the rtnl_mutex.
3602  */
3603 void netdev_rx_handler_unregister(struct net_device *dev)
3604 {
3605
3606         ASSERT_RTNL();
3607         RCU_INIT_POINTER(dev->rx_handler, NULL);
3608         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3609          * section has a guarantee to see a non NULL rx_handler_data
3610          * as well.
3611          */
3612         synchronize_net();
3613         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3614 }
3615 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3616
3617 /*
3618  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3619  * the special handling of PFMEMALLOC skbs.
3620  */
3621 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3622 {
3623         switch (skb->protocol) {
3624         case htons(ETH_P_ARP):
3625         case htons(ETH_P_IP):
3626         case htons(ETH_P_IPV6):
3627         case htons(ETH_P_8021Q):
3628         case htons(ETH_P_8021AD):
3629                 return true;
3630         default:
3631                 return false;
3632         }
3633 }
3634
3635 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3636 {
3637         struct packet_type *ptype, *pt_prev;
3638         rx_handler_func_t *rx_handler;
3639         struct net_device *orig_dev;
3640         bool deliver_exact = false;
3641         int ret = NET_RX_DROP;
3642         __be16 type;
3643
3644         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3645
3646         trace_netif_receive_skb(skb);
3647
3648         orig_dev = skb->dev;
3649
3650         skb_reset_network_header(skb);
3651         if (!skb_transport_header_was_set(skb))
3652                 skb_reset_transport_header(skb);
3653         skb_reset_mac_len(skb);
3654
3655         pt_prev = NULL;
3656
3657         rcu_read_lock();
3658
3659 another_round:
3660         skb->skb_iif = skb->dev->ifindex;
3661
3662         __this_cpu_inc(softnet_data.processed);
3663
3664         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3665             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3666                 skb = skb_vlan_untag(skb);
3667                 if (unlikely(!skb))
3668                         goto unlock;
3669         }
3670
3671 #ifdef CONFIG_NET_CLS_ACT
3672         if (skb->tc_verd & TC_NCLS) {
3673                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3674                 goto ncls;
3675         }
3676 #endif
3677
3678         if (pfmemalloc)
3679                 goto skip_taps;
3680
3681         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3682                 if (pt_prev)
3683                         ret = deliver_skb(skb, pt_prev, orig_dev);
3684                 pt_prev = ptype;
3685         }
3686
3687         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3688                 if (pt_prev)
3689                         ret = deliver_skb(skb, pt_prev, orig_dev);
3690                 pt_prev = ptype;
3691         }
3692
3693 skip_taps:
3694 #ifdef CONFIG_NET_CLS_ACT
3695         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3696         if (!skb)
3697                 goto unlock;
3698 ncls:
3699 #endif
3700
3701         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3702                 goto drop;
3703
3704         if (skb_vlan_tag_present(skb)) {
3705                 if (pt_prev) {
3706                         ret = deliver_skb(skb, pt_prev, orig_dev);
3707                         pt_prev = NULL;
3708                 }
3709                 if (vlan_do_receive(&skb))
3710                         goto another_round;
3711                 else if (unlikely(!skb))
3712                         goto unlock;
3713         }
3714
3715         rx_handler = rcu_dereference(skb->dev->rx_handler);
3716         if (rx_handler) {
3717                 if (pt_prev) {
3718                         ret = deliver_skb(skb, pt_prev, orig_dev);
3719                         pt_prev = NULL;
3720                 }
3721                 switch (rx_handler(&skb)) {
3722                 case RX_HANDLER_CONSUMED:
3723                         ret = NET_RX_SUCCESS;
3724                         goto unlock;
3725                 case RX_HANDLER_ANOTHER:
3726                         goto another_round;
3727                 case RX_HANDLER_EXACT:
3728                         deliver_exact = true;
3729                 case RX_HANDLER_PASS:
3730                         break;
3731                 default:
3732                         BUG();
3733                 }
3734         }
3735
3736         if (unlikely(skb_vlan_tag_present(skb))) {
3737                 if (skb_vlan_tag_get_id(skb))
3738                         skb->pkt_type = PACKET_OTHERHOST;
3739                 /* Note: we might in the future use prio bits
3740                  * and set skb->priority like in vlan_do_receive()
3741                  * For the time being, just ignore Priority Code Point
3742                  */
3743                 skb->vlan_tci = 0;
3744         }
3745
3746         type = skb->protocol;
3747
3748         /* deliver only exact match when indicated */
3749         if (likely(!deliver_exact)) {
3750                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3751                                        &ptype_base[ntohs(type) &
3752                                                    PTYPE_HASH_MASK]);
3753         }
3754
3755         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3756                                &orig_dev->ptype_specific);
3757
3758         if (unlikely(skb->dev != orig_dev)) {
3759                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3760                                        &skb->dev->ptype_specific);
3761         }
3762
3763         if (pt_prev) {
3764                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3765                         goto drop;
3766                 else
3767                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3768         } else {
3769 drop:
3770                 atomic_long_inc(&skb->dev->rx_dropped);
3771                 kfree_skb(skb);
3772                 /* Jamal, now you will not able to escape explaining
3773                  * me how you were going to use this. :-)
3774                  */
3775                 ret = NET_RX_DROP;
3776         }
3777
3778 unlock:
3779         rcu_read_unlock();
3780         return ret;
3781 }
3782
3783 static int __netif_receive_skb(struct sk_buff *skb)
3784 {
3785         int ret;
3786
3787         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3788                 unsigned long pflags = current->flags;
3789
3790                 /*
3791                  * PFMEMALLOC skbs are special, they should
3792                  * - be delivered to SOCK_MEMALLOC sockets only
3793                  * - stay away from userspace
3794                  * - have bounded memory usage
3795                  *
3796                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3797                  * context down to all allocation sites.
3798                  */
3799                 current->flags |= PF_MEMALLOC;
3800                 ret = __netif_receive_skb_core(skb, true);
3801                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3802         } else
3803                 ret = __netif_receive_skb_core(skb, false);
3804
3805         return ret;
3806 }
3807
3808 static int netif_receive_skb_internal(struct sk_buff *skb)
3809 {
3810         net_timestamp_check(netdev_tstamp_prequeue, skb);
3811
3812         if (skb_defer_rx_timestamp(skb))
3813                 return NET_RX_SUCCESS;
3814
3815 #ifdef CONFIG_RPS
3816         if (static_key_false(&rps_needed)) {
3817                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3818                 int cpu, ret;
3819
3820                 rcu_read_lock();
3821
3822                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3823
3824                 if (cpu >= 0) {
3825                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3826                         rcu_read_unlock();
3827                         return ret;
3828                 }
3829                 rcu_read_unlock();
3830         }
3831 #endif
3832         return __netif_receive_skb(skb);
3833 }
3834
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for receive data processing.  The call always
 *	completes; the buffer may still be dropped during processing, for
 *	congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3857
3858 /* Network device is going away, flush any packets still pending
3859  * Called with irqs disabled.
3860  */
3861 static void flush_backlog(void *arg)
3862 {
3863         struct net_device *dev = arg;
3864         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3865         struct sk_buff *skb, *tmp;
3866
3867         rps_lock(sd);
3868         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3869                 if (skb->dev == dev) {
3870                         __skb_unlink(skb, &sd->input_pkt_queue);
3871                         kfree_skb(skb);
3872                         input_queue_head_incr(sd);
3873                 }
3874         }
3875         rps_unlock(sd);
3876
3877         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3878                 if (skb->dev == dev) {
3879                         __skb_unlink(skb, &sd->process_queue);
3880                         kfree_skb(skb);
3881                         input_queue_head_incr(sd);
3882                 }
3883         }
3884 }
3885
3886 static int napi_gro_complete(struct sk_buff *skb)
3887 {
3888         struct packet_offload *ptype;
3889         __be16 type = skb->protocol;
3890         struct list_head *head = &offload_base;
3891         int err = -ENOENT;
3892
3893         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3894
3895         if (NAPI_GRO_CB(skb)->count == 1) {
3896                 skb_shinfo(skb)->gso_size = 0;
3897                 goto out;
3898         }
3899
3900         rcu_read_lock();
3901         list_for_each_entry_rcu(ptype, head, list) {
3902                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3903                         continue;
3904
3905                 err = ptype->callbacks.gro_complete(skb, 0);
3906                 break;
3907         }
3908         rcu_read_unlock();
3909
3910         if (err) {
3911                 WARN_ON(&ptype->list == head);
3912                 kfree_skb(skb);
3913                 return NET_RX_SUCCESS;
3914         }
3915
3916 out:
3917         return netif_receive_skb_internal(skb);
3918 }
3919
3920 /* napi->gro_list contains packets ordered by age.
3921  * youngest packets at the head of it.
3922  * Complete skbs in reverse order to reduce latencies.
3923  */
3924 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3925 {
3926         struct sk_buff *skb, *prev = NULL;
3927
3928         /* scan list and build reverse chain */
3929         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3930                 skb->prev = prev;
3931                 prev = skb;
3932         }
3933
3934         for (skb = prev; skb; skb = prev) {
3935                 skb->next = NULL;
3936
3937                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3938                         return;
3939
3940                 prev = skb->prev;
3941                 napi_gro_complete(skb);
3942                 napi->gro_count--;
3943         }
3944
3945         napi->gro_list = NULL;
3946 }
3947 EXPORT_SYMBOL(napi_gro_flush);
3948
3949 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3950 {
3951         struct sk_buff *p;
3952         unsigned int maclen = skb->dev->hard_header_len;
3953         u32 hash = skb_get_hash_raw(skb);
3954
3955         for (p = napi->gro_list; p; p = p->next) {
3956                 unsigned long diffs;
3957
3958                 NAPI_GRO_CB(p)->flush = 0;
3959
3960                 if (hash != skb_get_hash_raw(p)) {
3961                         NAPI_GRO_CB(p)->same_flow = 0;
3962                         continue;
3963                 }
3964
3965                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3966                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3967                 if (maclen == ETH_HLEN)
3968                         diffs |= compare_ether_header(skb_mac_header(p),
3969                                                       skb_mac_header(skb));
3970                 else if (!diffs)
3971                         diffs = memcmp(skb_mac_header(p),
3972                                        skb_mac_header(skb),
3973                                        maclen);
3974                 NAPI_GRO_CB(p)->same_flow = !diffs;
3975         }
3976 }
3977
3978 static void skb_gro_reset_offset(struct sk_buff *skb)
3979 {
3980         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3981         const skb_frag_t *frag0 = &pinfo->frags[0];
3982
3983         NAPI_GRO_CB(skb)->data_offset = 0;
3984         NAPI_GRO_CB(skb)->frag0 = NULL;
3985         NAPI_GRO_CB(skb)->frag0_len = 0;
3986
3987         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3988             pinfo->nr_frags &&
3989             !PageHighMem(skb_frag_page(frag0))) {
3990                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3991                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3992         }
3993 }
3994
3995 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3996 {
3997         struct skb_shared_info *pinfo = skb_shinfo(skb);
3998
3999         BUG_ON(skb->end - skb->tail < grow);
4000
4001         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4002
4003         skb->data_len -= grow;
4004         skb->tail += grow;
4005
4006         pinfo->frags[0].page_offset += grow;
4007         skb_frag_size_sub(&pinfo->frags[0], grow);
4008
4009         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4010                 skb_frag_unref(skb, 0);
4011                 memmove(pinfo->frags, pinfo->frags + 1,
4012                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4013         }
4014 }
4015
4016 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4017 {
4018         struct sk_buff **pp = NULL;
4019         struct packet_offload *ptype;
4020         __be16 type = skb->protocol;
4021         struct list_head *head = &offload_base;
4022         int same_flow;
4023         enum gro_result ret;
4024         int grow;
4025
4026         if (!(skb->dev->features & NETIF_F_GRO))
4027                 goto normal;
4028
4029         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4030                 goto normal;
4031
4032         gro_list_prepare(napi, skb);
4033
4034         rcu_read_lock();
4035         list_for_each_entry_rcu(ptype, head, list) {
4036                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4037                         continue;
4038
4039                 skb_set_network_header(skb, skb_gro_offset(skb));
4040                 skb_reset_mac_len(skb);
4041                 NAPI_GRO_CB(skb)->same_flow = 0;
4042                 NAPI_GRO_CB(skb)->flush = 0;
4043                 NAPI_GRO_CB(skb)->free = 0;
4044                 NAPI_GRO_CB(skb)->udp_mark = 0;
4045                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4046
4047                 /* Setup for GRO checksum validation */
4048                 switch (skb->ip_summed) {
4049                 case CHECKSUM_COMPLETE:
4050                         NAPI_GRO_CB(skb)->csum = skb->csum;
4051                         NAPI_GRO_CB(skb)->csum_valid = 1;
4052                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4053                         break;
4054                 case CHECKSUM_UNNECESSARY:
4055                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4056                         NAPI_GRO_CB(skb)->csum_valid = 0;
4057                         break;
4058                 default:
4059                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4060                         NAPI_GRO_CB(skb)->csum_valid = 0;
4061                 }
4062
4063                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4064                 break;
4065         }
4066         rcu_read_unlock();
4067
4068         if (&ptype->list == head)
4069                 goto normal;
4070
4071         same_flow = NAPI_GRO_CB(skb)->same_flow;
4072         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4073
4074         if (pp) {
4075                 struct sk_buff *nskb = *pp;
4076
4077                 *pp = nskb->next;
4078                 nskb->next = NULL;
4079                 napi_gro_complete(nskb);
4080                 napi->gro_count--;
4081         }
4082
4083         if (same_flow)
4084                 goto ok;
4085
4086         if (NAPI_GRO_CB(skb)->flush)
4087                 goto normal;
4088
4089         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4090                 struct sk_buff *nskb = napi->gro_list;
4091
4092                 /* locate the end of the list to select the 'oldest' flow */
4093                 while (nskb->next) {
4094                         pp = &nskb->next;
4095                         nskb = *pp;
4096                 }
4097                 *pp = NULL;
4098                 nskb->next = NULL;
4099                 napi_gro_complete(nskb);
4100         } else {
4101                 napi->gro_count++;
4102         }
4103         NAPI_GRO_CB(skb)->count = 1;
4104         NAPI_GRO_CB(skb)->age = jiffies;
4105         NAPI_GRO_CB(skb)->last = skb;
4106         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4107         skb->next = napi->gro_list;
4108         napi->gro_list = skb;
4109         ret = GRO_HELD;
4110
4111 pull:
4112         grow = skb_gro_offset(skb) - skb_headlen(skb);
4113         if (grow > 0)
4114                 gro_pull_from_frag0(skb, grow);
4115 ok:
4116         return ret;
4117
4118 normal:
4119         ret = GRO_NORMAL;
4120         goto pull;
4121 }
4122
4123 struct packet_offload *gro_find_receive_by_type(__be16 type)
4124 {
4125         struct list_head *offload_head = &offload_base;
4126         struct packet_offload *ptype;
4127
4128         list_for_each_entry_rcu(ptype, offload_head, list) {
4129                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4130                         continue;
4131                 return ptype;
4132         }
4133         return NULL;
4134 }
4135 EXPORT_SYMBOL(gro_find_receive_by_type);
4136
4137 struct packet_offload *gro_find_complete_by_type(__be16 type)
4138 {
4139         struct list_head *offload_head = &offload_base;
4140         struct packet_offload *ptype;
4141
4142         list_for_each_entry_rcu(ptype, offload_head, list) {
4143                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4144                         continue;
4145                 return ptype;
4146         }
4147         return NULL;
4148 }
4149 EXPORT_SYMBOL(gro_find_complete_by_type);
4150
4151 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4152 {
4153         switch (ret) {
4154         case GRO_NORMAL:
4155                 if (netif_receive_skb_internal(skb))
4156                         ret = GRO_DROP;
4157                 break;
4158
4159         case GRO_DROP:
4160                 kfree_skb(skb);
4161                 break;
4162
4163         case GRO_MERGED_FREE:
4164                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4165                         kmem_cache_free(skbuff_head_cache, skb);
4166                 else
4167                         __kfree_skb(skb);
4168                 break;
4169
4170         case GRO_HELD:
4171         case GRO_MERGED:
4172                 break;
4173         }
4174
4175         return ret;
4176 }
4177
4178 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4179 {
4180         trace_napi_gro_receive_entry(skb);
4181
4182         skb_gro_reset_offset(skb);
4183
4184         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4185 }
4186 EXPORT_SYMBOL(napi_gro_receive);
4187
4188 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4189 {
4190         if (unlikely(skb->pfmemalloc)) {
4191                 consume_skb(skb);
4192                 return;
4193         }
4194         __skb_pull(skb, skb_headlen(skb));
4195         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4196         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4197         skb->vlan_tci = 0;
4198         skb->dev = napi->dev;
4199         skb->skb_iif = 0;
4200         skb->encapsulation = 0;
4201         skb_shinfo(skb)->gso_type = 0;
4202         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4203
4204         napi->skb = skb;
4205 }
4206
4207 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4208 {
4209         struct sk_buff *skb = napi->skb;
4210
4211         if (!skb) {
4212                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4213                 napi->skb = skb;
4214         }
4215         return skb;
4216 }
4217 EXPORT_SYMBOL(napi_get_frags);
4218
4219 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4220                                       struct sk_buff *skb,
4221                                       gro_result_t ret)
4222 {
4223         switch (ret) {
4224         case GRO_NORMAL:
4225         case GRO_HELD:
4226                 __skb_push(skb, ETH_HLEN);
4227                 skb->protocol = eth_type_trans(skb, skb->dev);
4228                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4229                         ret = GRO_DROP;
4230                 break;
4231
4232         case GRO_DROP:
4233         case GRO_MERGED_FREE:
4234                 napi_reuse_skb(napi, skb);
4235                 break;
4236
4237         case GRO_MERGED:
4238                 break;
4239         }
4240
4241         return ret;
4242 }
4243
4244 /* Upper GRO stack assumes network header starts at gro_offset=0
4245  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4246  * We copy ethernet header into skb->data to have a common layout.
4247  */
4248 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4249 {
4250         struct sk_buff *skb = napi->skb;
4251         const struct ethhdr *eth;
4252         unsigned int hlen = sizeof(*eth);
4253
4254         napi->skb = NULL;
4255
4256         skb_reset_mac_header(skb);
4257         skb_gro_reset_offset(skb);
4258
4259         eth = skb_gro_header_fast(skb, 0);
4260         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4261                 eth = skb_gro_header_slow(skb, hlen, 0);
4262                 if (unlikely(!eth)) {
4263                         napi_reuse_skb(napi, skb);
4264                         return NULL;
4265                 }
4266         } else {
4267                 gro_pull_from_frag0(skb, hlen);
4268                 NAPI_GRO_CB(skb)->frag0 += hlen;
4269                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4270         }
4271         __skb_pull(skb, hlen);
4272
4273         /*
4274          * This works because the only protocols we care about don't require
4275          * special handling.
4276          * We'll fix it up properly in napi_frags_finish()
4277          */
4278         skb->protocol = eth->h_proto;
4279
4280         return skb;
4281 }
4282
4283 gro_result_t napi_gro_frags(struct napi_struct *napi)
4284 {
4285         struct sk_buff *skb = napi_frags_skb(napi);
4286
4287         if (!skb)
4288                 return GRO_DROP;
4289
4290         trace_napi_gro_frags_entry(skb);
4291
4292         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4293 }
4294 EXPORT_SYMBOL(napi_gro_frags);
4295
4296 /* Compute the checksum from gro_offset and return the folded value
4297  * after adding in any pseudo checksum.
4298  */
4299 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4300 {
4301         __wsum wsum;
4302         __sum16 sum;
4303
4304         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4305
4306         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4307         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4308         if (likely(!sum)) {
4309                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4310                     !skb->csum_complete_sw)
4311                         netdev_rx_csum_fault(skb->dev);
4312         }
4313
4314         NAPI_GRO_CB(skb)->csum = wsum;
4315         NAPI_GRO_CB(skb)->csum_valid = 1;
4316
4317         return sum;
4318 }
4319 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4320
/*
 * net_rps_action_and_irq_enable - send the RPS IPIs queued on @sd.
 *
 * Must be entered with local irqs disabled; always returns with local
 * irqs enabled.  Without CONFIG_RPS this only re-enables irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Detach the whole list before irqs are switched back on. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Kick RPS processing on every remote cpu still online. */
		do {
			struct softnet_data *following = pending->rps_ipi_next;

			if (cpu_online(pending->cpu))
				smp_call_function_single_async(pending->cpu,
							       &pending->csd);
			pending = following;
		} while (pending);

		return;
	}
#endif
	local_irq_enable();
}
4348
4349 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4350 {
4351 #ifdef CONFIG_RPS
4352         return sd->rps_ipi_list != NULL;
4353 #else
4354         return false;
4355 #endif
4356 }
4357
4358 static int process_backlog(struct napi_struct *napi, int quota)
4359 {
4360         int work = 0;
4361         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4362
4363         /* Check if we have pending ipi, its better to send them now,
4364          * not waiting net_rx_action() end.
4365          */
4366         if (sd_has_rps_ipi_waiting(sd)) {
4367                 local_irq_disable();
4368                 net_rps_action_and_irq_enable(sd);
4369         }
4370
4371         napi->weight = weight_p;
4372         local_irq_disable();
4373         while (1) {
4374                 struct sk_buff *skb;
4375
4376                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4377                         local_irq_enable();
4378                         __netif_receive_skb(skb);
4379                         local_irq_disable();
4380                         input_queue_head_incr(sd);
4381                         if (++work >= quota) {
4382                                 local_irq_enable();
4383                                 return work;
4384                         }
4385                 }
4386
4387                 rps_lock(sd);
4388                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4389                         /*
4390                          * Inline a custom version of __napi_complete().
4391                          * only current cpu owns and manipulates this napi,
4392                          * and NAPI_STATE_SCHED is the only possible flag set
4393                          * on backlog.
4394                          * We can use a plain write instead of clear_bit(),
4395                          * and we dont need an smp_mb() memory barrier.
4396                          */
4397                         napi->state = 0;
4398                         rps_unlock(sd);
4399
4400                         break;
4401                 }
4402
4403                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4404                                            &sd->process_queue);
4405                 rps_unlock(sd);
4406         }
4407         local_irq_enable();
4408
4409         return work;
4410 }
4411
4412 /**
4413  * __napi_schedule - schedule for receive
4414  * @n: entry to schedule
4415  *
4416  * The entry's receive function will be scheduled to run.
4417  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4418  */
4419 void __napi_schedule(struct napi_struct *n)
4420 {
4421         unsigned long flags;
4422
4423         local_irq_save(flags);
4424         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4425         local_irq_restore(flags);
4426 }
4427 EXPORT_SYMBOL(__napi_schedule);
4428
4429 /**
4430  * __napi_schedule_irqoff - schedule for receive
4431  * @n: entry to schedule
4432  *
4433  * Variant of __napi_schedule() assuming hard irqs are masked
4434  */
4435 void __napi_schedule_irqoff(struct napi_struct *n)
4436 {
4437         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4438 }
4439 EXPORT_SYMBOL(__napi_schedule_irqoff);
4440
4441 void __napi_complete(struct napi_struct *n)
4442 {
4443         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4444
4445         list_del_init(&n->poll_list);
4446         smp_mb__before_atomic();
4447         clear_bit(NAPI_STATE_SCHED, &n->state);
4448 }
4449 EXPORT_SYMBOL(__napi_complete);
4450
4451 void napi_complete_done(struct napi_struct *n, int work_done)
4452 {
4453         unsigned long flags;
4454
4455         /*
4456          * don't let napi dequeue from the cpu poll list
4457          * just in case its running on a different cpu
4458          */
4459         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4460                 return;
4461
4462         if (n->gro_list) {
4463                 unsigned long timeout = 0;
4464
4465                 if (work_done)
4466                         timeout = n->dev->gro_flush_timeout;
4467
4468                 if (timeout)
4469                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4470                                       HRTIMER_MODE_REL_PINNED);
4471                 else
4472                         napi_gro_flush(n, false);
4473         }
4474         if (likely(list_empty(&n->poll_list))) {
4475                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4476         } else {
4477                 /* If n->poll_list is not empty, we need to mask irqs */
4478                 local_irq_save(flags);
4479                 __napi_complete(n);
4480                 local_irq_restore(flags);
4481         }
4482 }
4483 EXPORT_SYMBOL(napi_complete_done);
4484
4485 /* must be called under rcu_read_lock(), as we dont take a reference */
4486 struct napi_struct *napi_by_id(unsigned int napi_id)
4487 {
4488         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4489         struct napi_struct *napi;
4490
4491         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4492                 if (napi->napi_id == napi_id)
4493                         return napi;
4494
4495         return NULL;
4496 }
4497 EXPORT_SYMBOL_GPL(napi_by_id);
4498
4499 void napi_hash_add(struct napi_struct *napi)
4500 {
4501         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4502
4503                 spin_lock(&napi_hash_lock);
4504
4505                 /* 0 is not a valid id, we also skip an id that is taken
4506                  * we expect both events to be extremely rare
4507                  */
4508                 napi->napi_id = 0;
4509                 while (!napi->napi_id) {
4510                         napi->napi_id = ++napi_gen_id;
4511                         if (napi_by_id(napi->napi_id))
4512                                 napi->napi_id = 0;
4513                 }
4514
4515                 hlist_add_head_rcu(&napi->napi_hash_node,
4516                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4517
4518                 spin_unlock(&napi_hash_lock);
4519         }
4520 }
4521 EXPORT_SYMBOL_GPL(napi_hash_add);
4522
4523 /* Warning : caller is responsible to make sure rcu grace period
4524  * is respected before freeing memory containing @napi
4525  */
4526 void napi_hash_del(struct napi_struct *napi)
4527 {
4528         spin_lock(&napi_hash_lock);
4529
4530         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4531                 hlist_del_rcu(&napi->napi_hash_node);
4532
4533         spin_unlock(&napi_hash_lock);
4534 }
4535 EXPORT_SYMBOL_GPL(napi_hash_del);
4536
4537 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4538 {
4539         struct napi_struct *napi;
4540
4541         napi = container_of(timer, struct napi_struct, timer);
4542         if (napi->gro_list)
4543                 napi_schedule(napi);
4544
4545         return HRTIMER_NORESTART;
4546 }
4547
4548 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4549                     int (*poll)(struct napi_struct *, int), int weight)
4550 {
4551         INIT_LIST_HEAD(&napi->poll_list);
4552         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4553         napi->timer.function = napi_watchdog;
4554         napi->gro_count = 0;
4555         napi->gro_list = NULL;
4556         napi->skb = NULL;
4557         napi->poll = poll;
4558         if (weight > NAPI_POLL_WEIGHT)
4559                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4560                             weight, dev->name);
4561         napi->weight = weight;
4562         list_add(&napi->dev_list, &dev->napi_list);
4563         napi->dev = dev;
4564 #ifdef CONFIG_NETPOLL
4565         spin_lock_init(&napi->poll_lock);
4566         napi->poll_owner = -1;
4567 #endif
4568         set_bit(NAPI_STATE_SCHED, &napi->state);
4569 }
4570 EXPORT_SYMBOL(netif_napi_add);
4571
4572 void napi_disable(struct napi_struct *n)
4573 {
4574         might_sleep();
4575         set_bit(NAPI_STATE_DISABLE, &n->state);
4576
4577         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4578                 msleep(1);
4579
4580         hrtimer_cancel(&n->timer);
4581
4582         clear_bit(NAPI_STATE_DISABLE, &n->state);
4583 }
4584 EXPORT_SYMBOL(napi_disable);
4585
4586 void netif_napi_del(struct napi_struct *napi)
4587 {
4588         list_del_init(&napi->dev_list);
4589         napi_free_frags(napi);
4590
4591         kfree_skb_list(napi->gro_list);
4592         napi->gro_list = NULL;
4593         napi->gro_count = 0;
4594 }
4595 EXPORT_SYMBOL(netif_napi_del);
4596
4597 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4598 {
4599         void *have;
4600         int work, weight;
4601
4602         list_del_init(&n->poll_list);
4603
4604         have = netpoll_poll_lock(n);
4605
4606         weight = n->weight;
4607
4608         /* This NAPI_STATE_SCHED test is for avoiding a race
4609          * with netpoll's poll_napi().  Only the entity which
4610          * obtains the lock and sees NAPI_STATE_SCHED set will
4611          * actually make the ->poll() call.  Therefore we avoid
4612          * accidentally calling ->poll() when NAPI is not scheduled.
4613          */
4614         work = 0;
4615         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4616                 work = n->poll(n, weight);
4617                 trace_napi_poll(n);
4618         }
4619
4620         WARN_ON_ONCE(work > weight);
4621
4622         if (likely(work < weight))
4623                 goto out_unlock;
4624
4625         /* Drivers must not modify the NAPI state if they
4626          * consume the entire weight.  In such cases this code
4627          * still "owns" the NAPI instance and therefore can
4628          * move the instance around on the list at-will.
4629          */
4630         if (unlikely(napi_disable_pending(n))) {
4631                 napi_complete(n);
4632                 goto out_unlock;
4633         }
4634
4635         if (n->gro_list) {
4636                 /* flush too old packets
4637                  * If HZ < 1000, flush all packets.
4638                  */
4639                 napi_gro_flush(n, HZ >= 1000);
4640         }
4641
4642         /* Some drivers may have called napi_schedule
4643          * prior to exhausting their budget.
4644          */
4645         if (unlikely(!list_empty(&n->poll_list))) {
4646                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4647                              n->dev ? n->dev->name : "backlog");
4648                 goto out_unlock;
4649         }
4650
4651         list_add_tail(&n->poll_list, repoll);
4652
4653 out_unlock:
4654         netpoll_poll_unlock(have);
4655
4656         return work;
4657 }
4658
4659 static void net_rx_action(struct softirq_action *h)
4660 {
4661         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4662         unsigned long time_limit = jiffies + 2;
4663         int budget = netdev_budget;
4664         LIST_HEAD(list);
4665         LIST_HEAD(repoll);
4666
4667         local_irq_disable();
4668         list_splice_init(&sd->poll_list, &list);
4669         local_irq_enable();
4670
4671         for (;;) {
4672                 struct napi_struct *n;
4673
4674                 if (list_empty(&list)) {
4675                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4676                                 return;
4677                         break;
4678                 }
4679
4680                 n = list_first_entry(&list, struct napi_struct, poll_list);
4681                 budget -= napi_poll(n, &repoll);
4682
4683                 /* If softirq window is exhausted then punt.
4684                  * Allow this to run for 2 jiffies since which will allow
4685                  * an average latency of 1.5/HZ.
4686                  */
4687                 if (unlikely(budget <= 0 ||
4688                              time_after_eq(jiffies, time_limit))) {
4689                         sd->time_squeeze++;
4690                         break;
4691                 }
4692         }
4693
4694         local_irq_disable();
4695
4696         list_splice_tail_init(&sd->poll_list, &list);
4697         list_splice_tail(&repoll, &list);
4698         list_splice(&list, &sd->poll_list);
4699         if (!list_empty(&sd->poll_list))
4700                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4701
4702         net_rps_action_and_irq_enable(sd);
4703 }
4704
4705 struct netdev_adjacent {
4706         struct net_device *dev;
4707
4708         /* upper master flag, there can only be one master device per list */
4709         bool master;
4710
4711         /* counter for the number of times this device was added to us */
4712         u16 ref_nr;
4713
4714         /* private field for the users */
4715         void *private;
4716
4717         struct list_head list;
4718         struct rcu_head rcu;
4719 };
4720
4721 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4722                                                  struct net_device *adj_dev,
4723                                                  struct list_head *adj_list)
4724 {
4725         struct netdev_adjacent *adj;
4726
4727         list_for_each_entry(adj, adj_list, list) {
4728                 if (adj->dev == adj_dev)
4729                         return adj;
4730         }
4731         return NULL;
4732 }
4733
4734 /**
4735  * netdev_has_upper_dev - Check if device is linked to an upper device
4736  * @dev: device
4737  * @upper_dev: upper device to check
4738  *
4739  * Find out if a device is linked to specified upper device and return true
4740  * in case it is. Note that this checks only immediate upper device,
4741  * not through a complete stack of devices. The caller must hold the RTNL lock.
4742  */
4743 bool netdev_has_upper_dev(struct net_device *dev,
4744                           struct net_device *upper_dev)
4745 {
4746         ASSERT_RTNL();
4747
4748         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4749 }
4750 EXPORT_SYMBOL(netdev_has_upper_dev);
4751
4752 /**
4753  * netdev_has_any_upper_dev - Check if device is linked to some device
4754  * @dev: device
4755  *
4756  * Find out if a device is linked to an upper device and return true in case
4757  * it is. The caller must hold the RTNL lock.
4758  */
4759 static bool netdev_has_any_upper_dev(struct net_device *dev)
4760 {
4761         ASSERT_RTNL();
4762
4763         return !list_empty(&dev->all_adj_list.upper);
4764 }
4765
4766 /**
4767  * netdev_master_upper_dev_get - Get master upper device
4768  * @dev: device
4769  *
4770  * Find a master upper device and return pointer to it or NULL in case
4771  * it's not there. The caller must hold the RTNL lock.
4772  */
4773 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4774 {
4775         struct netdev_adjacent *upper;
4776
4777         ASSERT_RTNL();
4778
4779         if (list_empty(&dev->adj_list.upper))
4780                 return NULL;
4781
4782         upper = list_first_entry(&dev->adj_list.upper,
4783                                  struct netdev_adjacent, list);
4784         if (likely(upper->master))
4785                 return upper->dev;
4786         return NULL;
4787 }
4788 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4789
4790 void *netdev_adjacent_get_private(struct list_head *adj_list)
4791 {
4792         struct netdev_adjacent *adj;
4793
4794         adj = list_entry(adj_list, struct netdev_adjacent, list);
4795
4796         return adj->private;
4797 }
4798 EXPORT_SYMBOL(netdev_adjacent_get_private);
4799
4800 /**
4801  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4802  * @dev: device
4803  * @iter: list_head ** of the current position
4804  *
4805  * Gets the next device from the dev's upper list, starting from iter
4806  * position. The caller must hold RCU read lock.
4807  */
4808 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4809                                                  struct list_head **iter)
4810 {
4811         struct netdev_adjacent *upper;
4812
4813         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4814
4815         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4816
4817         if (&upper->list == &dev->adj_list.upper)
4818                 return NULL;
4819
4820         *iter = &upper->list;
4821
4822         return upper->dev;
4823 }
4824 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4825
4826 /**
4827  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4828  * @dev: device
4829  * @iter: list_head ** of the current position
4830  *
4831  * Gets the next device from the dev's upper list, starting from iter
4832  * position. The caller must hold RCU read lock.
4833  */
4834 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4835                                                      struct list_head **iter)
4836 {
4837         struct netdev_adjacent *upper;
4838
4839         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4840
4841         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4842
4843         if (&upper->list == &dev->all_adj_list.upper)
4844                 return NULL;
4845
4846         *iter = &upper->list;
4847
4848         return upper->dev;
4849 }
4850 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4851
4852 /**
4853  * netdev_lower_get_next_private - Get the next ->private from the
4854  *                                 lower neighbour list
4855  * @dev: device
4856  * @iter: list_head ** of the current position
4857  *
4858  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4859  * list, starting from iter position. The caller must hold either hold the
4860  * RTNL lock or its own locking that guarantees that the neighbour lower
4861  * list will remain unchainged.
4862  */
4863 void *netdev_lower_get_next_private(struct net_device *dev,
4864                                     struct list_head **iter)
4865 {
4866         struct netdev_adjacent *lower;
4867
4868         lower = list_entry(*iter, struct netdev_adjacent, list);
4869
4870         if (&lower->list == &dev->adj_list.lower)
4871                 return NULL;
4872
4873         *iter = lower->list.next;
4874
4875         return lower->private;
4876 }
4877 EXPORT_SYMBOL(netdev_lower_get_next_private);
4878
4879 /**
4880  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4881  *                                     lower neighbour list, RCU
4882  *                                     variant
4883  * @dev: device
4884  * @iter: list_head ** of the current position
4885  *
4886  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4887  * list, starting from iter position. The caller must hold RCU read lock.
4888  */
4889 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4890                                         struct list_head **iter)
4891 {
4892         struct netdev_adjacent *lower;
4893
4894         WARN_ON_ONCE(!rcu_read_lock_held());
4895
4896         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4897
4898         if (&lower->list == &dev->adj_list.lower)
4899                 return NULL;
4900
4901         *iter = &lower->list;
4902
4903         return lower->private;
4904 }
4905 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4906
4907 /**
4908  * netdev_lower_get_next - Get the next device from the lower neighbour
4909  *                         list
4910  * @dev: device
4911  * @iter: list_head ** of the current position
4912  *
4913  * Gets the next netdev_adjacent from the dev's lower neighbour
4914  * list, starting from iter position. The caller must hold RTNL lock or
4915  * its own locking that guarantees that the neighbour lower
4916  * list will remain unchainged.
4917  */
4918 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4919 {
4920         struct netdev_adjacent *lower;
4921
4922         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4923
4924         if (&lower->list == &dev->adj_list.lower)
4925                 return NULL;
4926
4927         *iter = &lower->list;
4928
4929         return lower->dev;
4930 }
4931 EXPORT_SYMBOL(netdev_lower_get_next);
4932
4933 /**
4934  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4935  *                                     lower neighbour list, RCU
4936  *                                     variant
4937  * @dev: device
4938  *
4939  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4940  * list. The caller must hold RCU read lock.
4941  */
4942 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4943 {
4944         struct netdev_adjacent *lower;
4945
4946         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4947                         struct netdev_adjacent, list);
4948         if (lower)
4949                 return lower->private;
4950         return NULL;
4951 }
4952 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4953
4954 /**
4955  * netdev_master_upper_dev_get_rcu - Get master upper device
4956  * @dev: device
4957  *
4958  * Find a master upper device and return pointer to it or NULL in case
4959  * it's not there. The caller must hold the RCU read lock.
4960  */
4961 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4962 {
4963         struct netdev_adjacent *upper;
4964
4965         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4966                                        struct netdev_adjacent, list);
4967         if (upper && likely(upper->master))
4968                 return upper->dev;
4969         return NULL;
4970 }
4971 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4972
4973 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4974                               struct net_device *adj_dev,
4975                               struct list_head *dev_list)
4976 {
4977         char linkname[IFNAMSIZ+7];
4978         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4979                 "upper_%s" : "lower_%s", adj_dev->name);
4980         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4981                                  linkname);
4982 }
4983 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4984                                char *name,
4985                                struct list_head *dev_list)
4986 {
4987         char linkname[IFNAMSIZ+7];
4988         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4989                 "upper_%s" : "lower_%s", name);
4990         sysfs_remove_link(&(dev->dev.kobj), linkname);
4991 }
4992
4993 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4994                                                  struct net_device *adj_dev,
4995                                                  struct list_head *dev_list)
4996 {
4997         return (dev_list == &dev->adj_list.upper ||
4998                 dev_list == &dev->adj_list.lower) &&
4999                 net_eq(dev_net(dev), dev_net(adj_dev));
5000 }
5001
5002 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5003                                         struct net_device *adj_dev,
5004                                         struct list_head *dev_list,
5005                                         void *private, bool master)
5006 {
5007         struct netdev_adjacent *adj;
5008         int ret;
5009
5010         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5011
5012         if (adj) {
5013                 adj->ref_nr++;
5014                 return 0;
5015         }
5016
5017         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5018         if (!adj)
5019                 return -ENOMEM;
5020
5021         adj->dev = adj_dev;
5022         adj->master = master;
5023         adj->ref_nr = 1;
5024         adj->private = private;
5025         dev_hold(adj_dev);
5026
5027         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5028                  adj_dev->name, dev->name, adj_dev->name);
5029
5030         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5031                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5032                 if (ret)
5033                         goto free_adj;
5034         }
5035
5036         /* Ensure that master link is always the first item in list. */
5037         if (master) {
5038                 ret = sysfs_create_link(&(dev->dev.kobj),
5039                                         &(adj_dev->dev.kobj), "master");
5040                 if (ret)
5041                         goto remove_symlinks;
5042
5043                 list_add_rcu(&adj->list, dev_list);
5044         } else {
5045                 list_add_tail_rcu(&adj->list, dev_list);
5046         }
5047
5048         return 0;
5049
5050 remove_symlinks:
5051         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5052                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5053 free_adj:
5054         kfree(adj);
5055         dev_put(adj_dev);
5056
5057         return ret;
5058 }
5059
5060 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5061                                          struct net_device *adj_dev,
5062                                          struct list_head *dev_list)
5063 {
5064         struct netdev_adjacent *adj;
5065
5066         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5067
5068         if (!adj) {
5069                 pr_err("tried to remove device %s from %s\n",
5070                        dev->name, adj_dev->name);
5071                 BUG();
5072         }
5073
5074         if (adj->ref_nr > 1) {
5075                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5076                          adj->ref_nr-1);
5077                 adj->ref_nr--;
5078                 return;
5079         }
5080
5081         if (adj->master)
5082                 sysfs_remove_link(&(dev->dev.kobj), "master");
5083
5084         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5085                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5086
5087         list_del_rcu(&adj->list);
5088         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5089                  adj_dev->name, dev->name, adj_dev->name);
5090         dev_put(adj_dev);
5091         kfree_rcu(adj, rcu);
5092 }
5093
5094 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5095                                             struct net_device *upper_dev,
5096                                             struct list_head *up_list,
5097                                             struct list_head *down_list,
5098                                             void *private, bool master)
5099 {
5100         int ret;
5101
5102         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5103                                            master);
5104         if (ret)
5105                 return ret;
5106
5107         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5108                                            false);
5109         if (ret) {
5110                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5111                 return ret;
5112         }
5113
5114         return 0;
5115 }
5116
5117 static int __netdev_adjacent_dev_link(struct net_device *dev,
5118                                       struct net_device *upper_dev)
5119 {
5120         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5121                                                 &dev->all_adj_list.upper,
5122                                                 &upper_dev->all_adj_list.lower,
5123                                                 NULL, false);
5124 }
5125
/* Undo __netdev_adjacent_dev_link_lists(): remove @upper_dev from
 * @up_list (a list of @dev), then @dev from @down_list (a list of
 * @upper_dev).
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward link first ... */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
	/* ... then the matching downward link */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5134
5135 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5136                                          struct net_device *upper_dev)
5137 {
5138         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5139                                            &dev->all_adj_list.upper,
5140                                            &upper_dev->all_adj_list.lower);
5141 }
5142
5143 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5144                                                 struct net_device *upper_dev,
5145                                                 void *private, bool master)
5146 {
5147         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5148
5149         if (ret)
5150                 return ret;
5151
5152         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5153                                                &dev->adj_list.upper,
5154                                                &upper_dev->adj_list.lower,
5155                                                private, master);
5156         if (ret) {
5157                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5158                 return ret;
5159         }
5160
5161         return 0;
5162 }
5163
5164 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5165                                                    struct net_device *upper_dev)
5166 {
5167         __netdev_adjacent_dev_unlink(dev, upper_dev);
5168         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5169                                            &dev->adj_list.upper,
5170                                            &upper_dev->adj_list.lower);
5171 }
5172
/* Make @upper_dev an upper device of @dev and propagate the new
 * relationship to every device already recorded in @dev's
 * all_adj_list.lower and in @upper_dev's all_adj_list.upper.
 *
 * @master and @private are handed to
 * __netdev_adjacent_dev_link_neighbour() for the direct link.
 *
 * The caller must hold RTNL (checked by ASSERT_RTNL()).
 *
 * Returns 0 on success, in which case NETDEV_CHANGEUPPER has been sent
 * for @dev. Returns -EBUSY if @dev == @upper_dev, if @dev is already
 * found among @upper_dev's upper devices (loop), or if @master is set and
 * netdev_master_upper_dev_get(@dev) reports an existing master. Returns
 * -EEXIST if @upper_dev is already among @dev's upper devices. Any other
 * error comes from the linking helpers; every link made by this call has
 * been removed again before it is returned.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices themselves. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

	/* Error unwinding. Each label undoes one of the three linking
	 * stages above, in reverse order, and then falls through to the
	 * next label. On entry via goto, i (and j) still point at the
	 * entry whose link attempt failed, so only the entries walked
	 * before it are unlinked; setting the cursor(s) to NULL before
	 * falling through makes the following stage unwind completely.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			/* stop at the exact (i, j) pair that failed */
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		/* rows after the failing one were never linked */
		if (i == to_i)
			break;
	}

	/* finally drop the direct neighbour link made at the start */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
5273
5274 /**
5275  * netdev_upper_dev_link - Add a link to the upper device
5276  * @dev: device
5277  * @upper_dev: new upper device
5278  *
5279  * Adds a link to device which is upper to this one. The caller must hold
5280  * the RTNL lock. On a failure a negative errno code is returned.
5281  * On success the reference counts are adjusted and the function
5282  * returns zero.
5283  */
5284 int netdev_upper_dev_link(struct net_device *dev,
5285                           struct net_device *upper_dev)
5286 {
5287         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5288 }
5289 EXPORT_SYMBOL(netdev_upper_dev_link);
5290
5291 /**
5292  * netdev_master_upper_dev_link - Add a master link to the upper device
5293  * @dev: device
5294  * @upper_dev: new upper device
5295  *
5296  * Adds a link to device which is upper to this one. In this case, only
5297  * one master upper device can be linked, although other non-master devices
5298  * might be linked as well. The caller must hold the RTNL lock.
5299  * On a failure a negative errno code is returned. On success the reference
5300  * counts are adjusted and the function returns zero.
5301  */
5302 int netdev_master_upper_dev_link(struct net_device *dev,
5303                                  struct net_device *upper_dev)
5304 {
5305         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5306 }
5307 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5308
5309 int netdev_master_upper_dev_link_private(struct net_device *dev,
5310                                          struct net_device *upper_dev,
5311                                          void *private)
5312 {
5313         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5314 }
5315 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5316
5317 /**
5318  * netdev_upper_dev_unlink - Removes a link to upper device
5319  * @dev: device
5320  * @upper_dev: new upper device
5321  *
5322  * Removes a link to device which is upper to this one. The caller must hold
5323  * the RTNL lock.
5324  */
5325 void netdev_upper_dev_unlink(struct net_device *dev,
5326                              struct net_device *upper_dev)
5327 {
5328         struct netdev_adjacent *i, *j;
5329         ASSERT_RTNL();
5330
5331         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5332
5333         /* Here is the tricky part. We must remove all dev's lower
5334          * devices from all upper_dev's upper devices and vice
5335          * versa, to maintain the graph relationship.
5336          */
5337         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5338                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5339                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5340
5341         /* remove also the devices itself from lower/upper device
5342          * list
5343          */
5344         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5345                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5346
5347         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5348                 __netdev_adjacent_dev_unlink(dev, i->dev);
5349
5350         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5351 }
5352 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5353
5354 /**
5355  * netdev_bonding_info_change - Dispatch event about slave change
5356  * @dev: device
5357  * @bonding_info: info to dispatch
5358  *
5359  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5360  * The caller must hold the RTNL lock.
5361  */
5362 void netdev_bonding_info_change(struct net_device *dev,
5363                                 struct netdev_bonding_info *bonding_info)
5364 {
5365         struct netdev_notifier_bonding_info     info;
5366
5367         memcpy(&info.bonding_info, bonding_info,
5368                sizeof(struct netdev_bonding_info));
5369         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5370                                       &info.info);
5371 }
5372 EXPORT_SYMBOL(netdev_bonding_info_change);
5373
5374 static void netdev_adjacent_add_links(struct net_device *dev)
5375 {
5376         struct netdev_adjacent *iter;
5377
5378         struct net *net = dev_net(dev);
5379
5380         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5381                 if (!net_eq(net,dev_net(iter->dev)))
5382                         continue;
5383                 netdev_adjacent_sysfs_add(iter->dev, dev,
5384                                           &iter->dev->adj_list.lower);
5385                 netdev_adjacent_sysfs_add(dev, iter->dev,
5386                                           &dev->adj_list.upper);
5387         }
5388
5389         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5390                 if (!net_eq(net,dev_net(iter->dev)))
5391                         continue;
5392                 netdev_adjacent_sysfs_add(iter->dev, dev,
5393                                           &iter->dev->adj_list.upper);
5394                 netdev_adjacent_sysfs_add(dev, iter->dev,
5395                                           &dev->adj_list.lower);
5396         }
5397 }
5398
5399 static void netdev_adjacent_del_links(struct net_device *dev)
5400 {
5401         struct netdev_adjacent *iter;
5402
5403         struct net *net = dev_net(dev);
5404
5405         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5406                 if (!net_eq(net,dev_net(iter->dev)))
5407                         continue;
5408                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5409                                           &iter->dev->adj_list.lower);
5410                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5411                                           &dev->adj_list.upper);
5412         }
5413
5414         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5415                 if (!net_eq(net,dev_net(iter->dev)))
5416                         continue;
5417                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5418                                           &iter->dev->adj_list.upper);
5419                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5420                                           &dev->adj_list.lower);
5421         }
5422 }
5423
5424 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5425 {
5426         struct netdev_adjacent *iter;
5427
5428         struct net *net = dev_net(dev);
5429
5430         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5431                 if (!net_eq(net,dev_net(iter->dev)))
5432                         continue;
5433                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5434                                           &iter->dev->adj_list.lower);
5435                 netdev_adjacent_sysfs_add(iter->dev, dev,
5436                                           &iter->dev->adj_list.lower);
5437         }
5438
5439         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5440                 if (!net_eq(net,dev_net(iter->dev)))
5441                         continue;
5442                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5443                                           &iter->dev->adj_list.upper);
5444                 netdev_adjacent_sysfs_add(iter->dev, dev,
5445                                           &iter->dev->adj_list.upper);
5446         }
5447 }
5448
5449 void *netdev_lower_dev_get_private(struct net_device *dev,
5450                                    struct net_device *lower_dev)
5451 {
5452         struct netdev_adjacent *lower;
5453
5454         if (!lower_dev)
5455                 return NULL;
5456         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5457         if (!lower)
5458                 return NULL;
5459
5460         return lower->private;
5461 }
5462 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5463
5464
5465 int dev_get_nest_level(struct net_device *dev,
5466                        bool (*type_check)(struct net_device *dev))
5467 {
5468         struct net_device *lower = NULL;
5469         struct list_head *iter;
5470         int max_nest = -1;
5471         int nest;
5472
5473         ASSERT_RTNL();
5474
5475         netdev_for_each_lower_dev(dev, lower, iter) {
5476                 nest = dev_get_nest_level(lower, type_check);
5477                 if (max_nest < nest)
5478                         max_nest = nest;
5479         }
5480
5481         if (type_check(dev))
5482                 max_nest++;
5483
5484         return max_nest;
5485 }
5486 EXPORT_SYMBOL(dev_get_nest_level);
5487
5488 static void dev_change_rx_flags(struct net_device *dev, int flags)
5489 {
5490         const struct net_device_ops *ops = dev->netdev_ops;
5491
5492         if (ops->ndo_change_rx_flags)
5493                 ops->ndo_change_rx_flags(dev, flags);
5494 }
5495
5496 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5497 {
5498         unsigned int old_flags = dev->flags;
5499         kuid_t uid;
5500         kgid_t gid;
5501
5502         ASSERT_RTNL();
5503
5504         dev->flags |= IFF_PROMISC;
5505         dev->promiscuity += inc;
5506         if (dev->promiscuity == 0) {
5507                 /*
5508                  * Avoid overflow.
5509                  * If inc causes overflow, untouch promisc and return error.
5510                  */
5511                 if (inc < 0)
5512                         dev->flags &= ~IFF_PROMISC;
5513                 else {
5514                         dev->promiscuity -= inc;
5515                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5516                                 dev->name);
5517                         return -EOVERFLOW;
5518                 }
5519         }
5520         if (dev->flags != old_flags) {
5521                 pr_info("device %s %s promiscuous mode\n",
5522                         dev->name,
5523                         dev->flags & IFF_PROMISC ? "entered" : "left");
5524                 if (audit_enabled) {
5525                         current_uid_gid(&uid, &gid);
5526                         audit_log(current->audit_context, GFP_ATOMIC,
5527                                 AUDIT_ANOM_PROMISCUOUS,
5528                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5529                                 dev->name, (dev->flags & IFF_PROMISC),
5530                                 (old_flags & IFF_PROMISC),
5531                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5532                                 from_kuid(&init_user_ns, uid),
5533                                 from_kgid(&init_user_ns, gid),
5534                                 audit_get_sessionid(current));
5535                 }
5536
5537                 dev_change_rx_flags(dev, IFF_PROMISC);
5538         }
5539         if (notify)
5540                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5541         return 0;
5542 }
5543
5544 /**
5545  *      dev_set_promiscuity     - update promiscuity count on a device
5546  *      @dev: device
5547  *      @inc: modifier
5548  *
5549  *      Add or remove promiscuity from a device. While the count in the device
5550  *      remains above zero the interface remains promiscuous. Once it hits zero
5551  *      the device reverts back to normal filtering operation. A negative inc
5552  *      value is used to drop promiscuity on the device.
5553  *      Return 0 if successful or a negative errno code on error.
5554  */
5555 int dev_set_promiscuity(struct net_device *dev, int inc)
5556 {
5557         unsigned int old_flags = dev->flags;
5558         int err;
5559
5560         err = __dev_set_promiscuity(dev, inc, true);
5561         if (err < 0)
5562                 return err;
5563         if (dev->flags != old_flags)
5564                 dev_set_rx_mode(dev);
5565         return err;
5566 }
5567 EXPORT_SYMBOL(dev_set_promiscuity);
5568
5569 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5570 {
5571         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5572
5573         ASSERT_RTNL();
5574
5575         dev->flags |= IFF_ALLMULTI;
5576         dev->allmulti += inc;
5577         if (dev->allmulti == 0) {
5578                 /*
5579                  * Avoid overflow.
5580                  * If inc causes overflow, untouch allmulti and return error.
5581                  */
5582                 if (inc < 0)
5583                         dev->flags &= ~IFF_ALLMULTI;
5584                 else {
5585                         dev->allmulti -= inc;
5586                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5587                                 dev->name);
5588                         return -EOVERFLOW;
5589                 }
5590         }
5591         if (dev->flags ^ old_flags) {
5592                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5593                 dev_set_rx_mode(dev);
5594                 if (notify)
5595                         __dev_notify_flags(dev, old_flags,
5596                                            dev->gflags ^ old_gflags);
5597         }
5598         return 0;
5599 }
5600
5601 /**
5602  *      dev_set_allmulti        - update allmulti count on a device
5603  *      @dev: device
5604  *      @inc: modifier
5605  *
5606  *      Add or remove reception of all multicast frames to a device. While the
5607  *      count in the device remains above zero the interface remains listening
5608  *      to all interfaces. Once it hits zero the device reverts back to normal
5609  *      filtering operation. A negative @inc value is used to drop the counter
5610  *      when releasing a resource needing all multicasts.
5611  *      Return 0 if successful or a negative errno code on error.
5612  */
5613
5614 int dev_set_allmulti(struct net_device *dev, int inc)
5615 {
5616         return __dev_set_allmulti(dev, inc, true);
5617 }
5618 EXPORT_SYMBOL(dev_set_allmulti);
5619
5620 /*
5621  *      Upload unicast and multicast address lists to device and
5622  *      configure RX filtering. When the device doesn't support unicast
5623  *      filtering it is put in promiscuous mode while unicast addresses
5624  *      are present.
5625  */
5626 void __dev_set_rx_mode(struct net_device *dev)
5627 {
5628         const struct net_device_ops *ops = dev->netdev_ops;
5629
5630         /* dev_open will call this function so the list will stay sane. */
5631         if (!(dev->flags&IFF_UP))
5632                 return;
5633
5634         if (!netif_device_present(dev))
5635                 return;
5636
5637         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5638                 /* Unicast addresses changes may only happen under the rtnl,
5639                  * therefore calling __dev_set_promiscuity here is safe.
5640                  */
5641                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5642                         __dev_set_promiscuity(dev, 1, false);
5643                         dev->uc_promisc = true;
5644                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5645                         __dev_set_promiscuity(dev, -1, false);
5646                         dev->uc_promisc = false;
5647                 }
5648         }
5649
5650         if (ops->ndo_set_rx_mode)
5651                 ops->ndo_set_rx_mode(dev);
5652 }
5653
/* Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5660
5661 /**
5662  *      dev_get_flags - get flags reported to userspace
5663  *      @dev: device
5664  *
5665  *      Get the combination of flag bits exported through APIs to userspace.
5666  */
5667 unsigned int dev_get_flags(const struct net_device *dev)
5668 {
5669         unsigned int flags;
5670
5671         flags = (dev->flags & ~(IFF_PROMISC |
5672                                 IFF_ALLMULTI |
5673                                 IFF_RUNNING |
5674                                 IFF_LOWER_UP |
5675                                 IFF_DORMANT)) |
5676                 (dev->gflags & (IFF_PROMISC |
5677                                 IFF_ALLMULTI));
5678
5679         if (netif_running(dev)) {
5680                 if (netif_oper_up(dev))
5681                         flags |= IFF_RUNNING;
5682                 if (netif_carrier_ok(dev))
5683                         flags |= IFF_LOWER_UP;
5684                 if (netif_dormant(dev))
5685                         flags |= IFF_DORMANT;
5686         }
5687
5688         return flags;
5689 }
5690 EXPORT_SYMBOL(dev_get_flags);
5691
5692 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5693 {
5694         unsigned int old_flags = dev->flags;
5695         int ret;
5696
5697         ASSERT_RTNL();
5698
5699         /*
5700          *      Set the flags on our device.
5701          */
5702
5703         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5704                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5705                                IFF_AUTOMEDIA)) |
5706                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5707                                     IFF_ALLMULTI));
5708
5709         /*
5710          *      Load in the correct multicast list now the flags have changed.
5711          */
5712
5713         if ((old_flags ^ flags) & IFF_MULTICAST)
5714                 dev_change_rx_flags(dev, IFF_MULTICAST);
5715
5716         dev_set_rx_mode(dev);
5717
5718         /*
5719          *      Have we downed the interface. We handle IFF_UP ourselves
5720          *      according to user attempts to set it, rather than blindly
5721          *      setting it.
5722          */
5723
5724         ret = 0;
5725         if ((old_flags ^ flags) & IFF_UP)
5726                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5727
5728         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5729                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5730                 unsigned int old_flags = dev->flags;
5731
5732                 dev->gflags ^= IFF_PROMISC;
5733
5734                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5735                         if (dev->flags != old_flags)
5736                                 dev_set_rx_mode(dev);
5737         }
5738
5739         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5740            is important. Some (broken) drivers set IFF_PROMISC, when
5741            IFF_ALLMULTI is requested not asking us and not reporting.
5742          */
5743         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5744                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5745
5746                 dev->gflags ^= IFF_ALLMULTI;
5747                 __dev_set_allmulti(dev, inc, false);
5748         }
5749
5750         return ret;
5751 }
5752
5753 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5754                         unsigned int gchanges)
5755 {
5756         unsigned int changes = dev->flags ^ old_flags;
5757
5758         if (gchanges)
5759                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5760
5761         if (changes & IFF_UP) {
5762                 if (dev->flags & IFF_UP)
5763                         call_netdevice_notifiers(NETDEV_UP, dev);
5764                 else
5765                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5766         }
5767
5768         if (dev->flags & IFF_UP &&
5769             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5770                 struct netdev_notifier_change_info change_info;
5771
5772                 change_info.flags_changed = changes;
5773                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5774                                               &change_info.info);
5775         }
5776 }
5777
5778 /**
5779  *      dev_change_flags - change device settings
5780  *      @dev: device
5781  *      @flags: device state flags
5782  *
5783  *      Change settings on device based state flags. The flags are
5784  *      in the userspace exported format.
5785  */
5786 int dev_change_flags(struct net_device *dev, unsigned int flags)
5787 {
5788         int ret;
5789         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5790
5791         ret = __dev_change_flags(dev, flags);
5792         if (ret < 0)
5793                 return ret;
5794
5795         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5796         __dev_notify_flags(dev, old_flags, changes);
5797         return ret;
5798 }
5799 EXPORT_SYMBOL(dev_change_flags);
5800
5801 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5802 {
5803         const struct net_device_ops *ops = dev->netdev_ops;
5804
5805         if (ops->ndo_change_mtu)
5806                 return ops->ndo_change_mtu(dev, new_mtu);
5807
5808         dev->mtu = new_mtu;
5809         return 0;
5810 }
5811
5812 /**
5813  *      dev_set_mtu - Change maximum transfer unit
5814  *      @dev: device
5815  *      @new_mtu: new transfer unit
5816  *
5817  *      Change the maximum transfer size of the network device.
5818  */
5819 int dev_set_mtu(struct net_device *dev, int new_mtu)
5820 {
5821         int err, orig_mtu;
5822
5823         if (new_mtu == dev->mtu)
5824                 return 0;
5825
5826         /*      MTU must be positive.    */
5827         if (new_mtu < 0)
5828                 return -EINVAL;
5829
5830         if (!netif_device_present(dev))
5831                 return -ENODEV;
5832
5833         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5834         err = notifier_to_errno(err);
5835         if (err)
5836                 return err;
5837
5838         orig_mtu = dev->mtu;
5839         err = __dev_set_mtu(dev, new_mtu);
5840
5841         if (!err) {
5842                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5843                 err = notifier_to_errno(err);
5844                 if (err) {
5845                         /* setting mtu back and notifying everyone again,
5846                          * so that they have a chance to revert changes.
5847                          */
5848                         __dev_set_mtu(dev, orig_mtu);
5849                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5850                 }
5851         }
5852         return err;
5853 }
5854 EXPORT_SYMBOL(dev_set_mtu);
5855
5856 /**
5857  *      dev_set_group - Change group this device belongs to
5858  *      @dev: device
5859  *      @new_group: group this device should belong to
5860  */
5861 void dev_set_group(struct net_device *dev, int new_group)
5862 {
5863         dev->group = new_group;
5864 }
5865 EXPORT_SYMBOL(dev_set_group);
5866
5867 /**
5868  *      dev_set_mac_address - Change Media Access Control Address
5869  *      @dev: device
5870  *      @sa: new address
5871  *
5872  *      Change the hardware (MAC) address of the device
5873  */
5874 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5875 {
5876         const struct net_device_ops *ops = dev->netdev_ops;
5877         int err;
5878
5879         if (!ops->ndo_set_mac_address)
5880                 return -EOPNOTSUPP;
5881         if (sa->sa_family != dev->type)
5882                 return -EINVAL;
5883         if (!netif_device_present(dev))
5884                 return -ENODEV;
5885         err = ops->ndo_set_mac_address(dev, sa);
5886         if (err)
5887                 return err;
5888         dev->addr_assign_type = NET_ADDR_SET;
5889         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5890         add_device_randomness(dev->dev_addr, dev->addr_len);
5891         return 0;
5892 }
5893 EXPORT_SYMBOL(dev_set_mac_address);
5894
5895 /**
5896  *      dev_change_carrier - Change device carrier
5897  *      @dev: device
5898  *      @new_carrier: new value
5899  *
5900  *      Change device carrier
5901  */
5902 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5903 {
5904         const struct net_device_ops *ops = dev->netdev_ops;
5905
5906         if (!ops->ndo_change_carrier)
5907                 return -EOPNOTSUPP;
5908         if (!netif_device_present(dev))
5909                 return -ENODEV;
5910         return ops->ndo_change_carrier(dev, new_carrier);
5911 }
5912 EXPORT_SYMBOL(dev_change_carrier);
5913
5914 /**
5915  *      dev_get_phys_port_id - Get device physical port ID
5916  *      @dev: device
5917  *      @ppid: port ID
5918  *
5919  *      Get device physical port ID
5920  */
5921 int dev_get_phys_port_id(struct net_device *dev,
5922                          struct netdev_phys_item_id *ppid)
5923 {
5924         const struct net_device_ops *ops = dev->netdev_ops;
5925
5926         if (!ops->ndo_get_phys_port_id)
5927                 return -EOPNOTSUPP;
5928         return ops->ndo_get_phys_port_id(dev, ppid);
5929 }
5930 EXPORT_SYMBOL(dev_get_phys_port_id);
5931
5932 /**
5933  *      dev_get_phys_port_name - Get device physical port name
5934  *      @dev: device
5935  *      @name: port name
5936  *
5937  *      Get device physical port name
5938  */
5939 int dev_get_phys_port_name(struct net_device *dev,
5940                            char *name, size_t len)
5941 {
5942         const struct net_device_ops *ops = dev->netdev_ops;
5943
5944         if (!ops->ndo_get_phys_port_name)
5945                 return -EOPNOTSUPP;
5946         return ops->ndo_get_phys_port_name(dev, name, len);
5947 }
5948 EXPORT_SYMBOL(dev_get_phys_port_name);
5949
5950 /**
5951  *      dev_new_index   -       allocate an ifindex
5952  *      @net: the applicable net namespace
5953  *
5954  *      Returns a suitable unique value for a new device interface
5955  *      number.  The caller must hold the rtnl semaphore or the
5956  *      dev_base_lock to be sure it remains unique.
5957  */
5958 static int dev_new_index(struct net *net)
5959 {
5960         int ifindex = net->ifindex;
5961         for (;;) {
5962                 if (++ifindex <= 0)
5963                         ifindex = 1;
5964                 if (!__dev_get_by_index(net, ifindex))
5965                         return net->ifindex = ifindex;
5966         }
5967 }
5968
/* Delayed registration/unregistration */
/* Devices queued by net_set_todo(), linked through dev->todo_list. */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5972
5973 static void net_set_todo(struct net_device *dev)
5974 {
5975         list_add_tail(&dev->todo_list, &net_todo_list);
5976         dev_net(dev)->dev_unreg_count++;
5977 }
5978
/* Tear down every device linked on @head through dev->unreg_list.
 *
 * Must not run during the boot phase and requires RTNL (BUG_ON /
 * ASSERT_RTNL below). Devices that were never registered are warned
 * about and dropped from @head; every remaining device must be in state
 * NETREG_REGISTERED. The remaining devices are closed, unlisted and
 * marked NETREG_UNREGISTERING, then each one is shut down, announced via
 * NETDEV_UNREGISTER / RTM_DELLINK, has its address lists flushed and its
 * kobject removed. Finally dev_put() is called on each device.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* _safe iteration: entries may be removed from @head in this loop */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* The RTM_DELLINK message is built here, before the
		 * address flush and ndo_uninit below, and only sent
		 * after them.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
6063
6064 static void rollback_registered(struct net_device *dev)
6065 {
6066         LIST_HEAD(single);
6067
6068         list_add(&dev->unreg_list, &single);
6069         rollback_registered_many(&single);
6070         list_del(&single);
6071 }
6072
6073 static netdev_features_t netdev_fix_features(struct net_device *dev,
6074         netdev_features_t features)
6075 {
6076         /* Fix illegal checksum combinations */
6077         if ((features & NETIF_F_HW_CSUM) &&
6078             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6079                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6080                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6081         }
6082
6083         /* TSO requires that SG is present as well. */
6084         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6085                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6086                 features &= ~NETIF_F_ALL_TSO;
6087         }
6088
6089         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6090                                         !(features & NETIF_F_IP_CSUM)) {
6091                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6092                 features &= ~NETIF_F_TSO;
6093                 features &= ~NETIF_F_TSO_ECN;
6094         }
6095
6096         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6097                                          !(features & NETIF_F_IPV6_CSUM)) {
6098                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6099                 features &= ~NETIF_F_TSO6;
6100         }
6101
6102         /* TSO ECN requires that TSO is present as well. */
6103         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6104                 features &= ~NETIF_F_TSO_ECN;
6105
6106         /* Software GSO depends on SG. */
6107         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6108                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6109                 features &= ~NETIF_F_GSO;
6110         }
6111
6112         /* UFO needs SG and checksumming */
6113         if (features & NETIF_F_UFO) {
6114                 /* maybe split UFO into V4 and V6? */
6115                 if (!((features & NETIF_F_GEN_CSUM) ||
6116                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6117                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6118                         netdev_dbg(dev,
6119                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6120                         features &= ~NETIF_F_UFO;
6121                 }
6122
6123                 if (!(features & NETIF_F_SG)) {
6124                         netdev_dbg(dev,
6125                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6126                         features &= ~NETIF_F_UFO;
6127                 }
6128         }
6129
6130 #ifdef CONFIG_NET_RX_BUSY_POLL
6131         if (dev->netdev_ops->ndo_busy_poll)
6132                 features |= NETIF_F_BUSY_POLL;
6133         else
6134 #endif
6135                 features &= ~NETIF_F_BUSY_POLL;
6136
6137         return features;
6138 }
6139
6140 int __netdev_update_features(struct net_device *dev)
6141 {
6142         netdev_features_t features;
6143         int err = 0;
6144
6145         ASSERT_RTNL();
6146
6147         features = netdev_get_wanted_features(dev);
6148
6149         if (dev->netdev_ops->ndo_fix_features)
6150                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6151
6152         /* driver might be less strict about feature dependencies */
6153         features = netdev_fix_features(dev, features);
6154
6155         if (dev->features == features)
6156                 return 0;
6157
6158         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6159                 &dev->features, &features);
6160
6161         if (dev->netdev_ops->ndo_set_features)
6162                 err = dev->netdev_ops->ndo_set_features(dev, features);
6163
6164         if (unlikely(err < 0)) {
6165                 netdev_err(dev,
6166                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6167                         err, &features, &dev->features);
6168                 return -1;
6169         }
6170
6171         if (!err)
6172                 dev->features = features;
6173
6174         return 1;
6175 }
6176
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute dev->features and raise a features-change notification
 *	whenever __netdev_update_features() returns non-zero.  Call this
 *	after driver or hardware dependent conditions that influence the
 *	feature set might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6191
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute dev->features and always raise a features-change
 *	notification, whether or not the set changed.  Use this in place
 *	of netdev_update_features() when dev->vlan_features may have
 *	changed too, so the change can reach stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we notify unconditionally. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6208
6209 /**
6210  *      netif_stacked_transfer_operstate -      transfer operstate
6211  *      @rootdev: the root or lower level device to transfer state from
6212  *      @dev: the device to transfer operstate to
6213  *
6214  *      Transfer operational state from root to device. This is normally
6215  *      called when a stacking relationship exists between the root
6216  *      device and the device(a leaf device).
6217  */
6218 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6219                                         struct net_device *dev)
6220 {
6221         if (rootdev->operstate == IF_OPER_DORMANT)
6222                 netif_dormant_on(dev);
6223         else
6224                 netif_dormant_off(dev);
6225
6226         if (netif_carrier_ok(rootdev)) {
6227                 if (!netif_carrier_ok(dev))
6228                         netif_carrier_on(dev);
6229         } else {
6230                 if (netif_carrier_ok(dev))
6231                         netif_carrier_off(dev);
6232         }
6233 }
6234 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6235
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed RX queue array for @dev (dev->num_rx_queues
 * entries), store it in dev->_rx and point each entry back at @dev.
 * kzalloc() is tried first with vzalloc() as fallback.
 *
 * Returns 0 on success or -ENOMEM if both allocators fail.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	const size_t bytes = nqueues * sizeof(*queues);
	unsigned int q;

	BUG_ON(nqueues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = 0; q < nqueues; q++)
		queues[q].dev = dev;

	return 0;
}
#endif
6258
6259 static void netdev_init_one_queue(struct net_device *dev,
6260                                   struct netdev_queue *queue, void *_unused)
6261 {
6262         /* Initialize queue lock */
6263         spin_lock_init(&queue->_xmit_lock);
6264         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6265         queue->xmit_lock_owner = -1;
6266         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6267         queue->dev = dev;
6268 #ifdef CONFIG_BQL
6269         dql_init(&queue->dql, HZ);
6270 #endif
6271 }
6272
6273 static void netif_free_tx_queues(struct net_device *dev)
6274 {
6275         kvfree(dev->_tx);
6276 }
6277
6278 static int netif_alloc_netdev_queues(struct net_device *dev)
6279 {
6280         unsigned int count = dev->num_tx_queues;
6281         struct netdev_queue *tx;
6282         size_t sz = count * sizeof(*tx);
6283
6284         BUG_ON(count < 1 || count > 0xffff);
6285
6286         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6287         if (!tx) {
6288                 tx = vzalloc(sz);
6289                 if (!tx)
6290                         return -ENOMEM;
6291         }
6292         dev->_tx = tx;
6293
6294         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6295         spin_lock_init(&dev->tx_global_lock);
6296
6297         return 0;
6298 }
6299
6300 /**
6301  *      register_netdevice      - register a network device
6302  *      @dev: device to register
6303  *
6304  *      Take a completed network device structure and add it to the kernel
6305  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6306  *      chain. 0 is returned on success. A negative errno code is returned
6307  *      on a failure to set up the device, or if the name is a duplicate.
6308  *
6309  *      Callers must hold the rtnl semaphore. You may want
6310  *      register_netdev() instead of this.
6311  *
6312  *      BUGS:
6313  *      The locking appears insufficient to guarantee two parallel registers
6314  *      will not get the same name.
6315  */
6316
6317 int register_netdevice(struct net_device *dev)
6318 {
6319         int ret;
6320         struct net *net = dev_net(dev);
6321
6322         BUG_ON(dev_boot_phase);
6323         ASSERT_RTNL();
6324
6325         might_sleep();
6326
6327         /* When net_device's are persistent, this will be fatal. */
6328         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6329         BUG_ON(!net);
6330
6331         spin_lock_init(&dev->addr_list_lock);
6332         netdev_set_addr_lockdep_class(dev);
6333
6334         ret = dev_get_valid_name(net, dev, dev->name);
6335         if (ret < 0)
6336                 goto out;
6337
6338         /* Init, if this function is available */
6339         if (dev->netdev_ops->ndo_init) {
6340                 ret = dev->netdev_ops->ndo_init(dev);
6341                 if (ret) {
6342                         if (ret > 0)
6343                                 ret = -EIO;
6344                         goto out;
6345                 }
6346         }
6347
6348         if (((dev->hw_features | dev->features) &
6349              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6350             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6351              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6352                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6353                 ret = -EINVAL;
6354                 goto err_uninit;
6355         }
6356
6357         ret = -EBUSY;
6358         if (!dev->ifindex)
6359                 dev->ifindex = dev_new_index(net);
6360         else if (__dev_get_by_index(net, dev->ifindex))
6361                 goto err_uninit;
6362
6363         /* Transfer changeable features to wanted_features and enable
6364          * software offloads (GSO and GRO).
6365          */
6366         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6367         dev->features |= NETIF_F_SOFT_FEATURES;
6368         dev->wanted_features = dev->features & dev->hw_features;
6369
6370         if (!(dev->flags & IFF_LOOPBACK)) {
6371                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6372         }
6373
6374         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6375          */
6376         dev->vlan_features |= NETIF_F_HIGHDMA;
6377
6378         /* Make NETIF_F_SG inheritable to tunnel devices.
6379          */
6380         dev->hw_enc_features |= NETIF_F_SG;
6381
6382         /* Make NETIF_F_SG inheritable to MPLS.
6383          */
6384         dev->mpls_features |= NETIF_F_SG;
6385
6386         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6387         ret = notifier_to_errno(ret);
6388         if (ret)
6389                 goto err_uninit;
6390
6391         ret = netdev_register_kobject(dev);
6392         if (ret)
6393                 goto err_uninit;
6394         dev->reg_state = NETREG_REGISTERED;
6395
6396         __netdev_update_features(dev);
6397
6398         /*
6399          *      Default initial state at registry is that the
6400          *      device is present.
6401          */
6402
6403         set_bit(__LINK_STATE_PRESENT, &dev->state);
6404
6405         linkwatch_init_dev(dev);
6406
6407         dev_init_scheduler(dev);
6408         dev_hold(dev);
6409         list_netdevice(dev);
6410         add_device_randomness(dev->dev_addr, dev->addr_len);
6411
6412         /* If the device has permanent device address, driver should
6413          * set dev_addr and also addr_assign_type should be set to
6414          * NET_ADDR_PERM (default value).
6415          */
6416         if (dev->addr_assign_type == NET_ADDR_PERM)
6417                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6418
6419         /* Notify protocols, that a new device appeared. */
6420         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6421         ret = notifier_to_errno(ret);
6422         if (ret) {
6423                 rollback_registered(dev);
6424                 dev->reg_state = NETREG_UNREGISTERED;
6425         }
6426         /*
6427          *      Prevent userspace races by waiting until the network
6428          *      device is fully setup before sending notifications.
6429          */
6430         if (!dev->rtnl_link_ops ||
6431             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6432                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6433
6434 out:
6435         return ret;
6436
6437 err_uninit:
6438         if (dev->netdev_ops->ndo_uninit)
6439                 dev->netdev_ops->ndo_uninit(dev);
6440         goto out;
6441 }
6442 EXPORT_SYMBOL(register_netdevice);
6443
6444 /**
6445  *      init_dummy_netdev       - init a dummy network device for NAPI
6446  *      @dev: device to init
6447  *
6448  *      This takes a network device structure and initialize the minimum
6449  *      amount of fields so it can be used to schedule NAPI polls without
6450  *      registering a full blown interface. This is to be used by drivers
6451  *      that need to tie several hardware interfaces to a single NAPI
6452  *      poll scheduler due to HW limitations.
6453  */
6454 int init_dummy_netdev(struct net_device *dev)
6455 {
6456         /* Clear everything. Note we don't initialize spinlocks
6457          * are they aren't supposed to be taken by any of the
6458          * NAPI code and this dummy netdev is supposed to be
6459          * only ever used for NAPI polls
6460          */
6461         memset(dev, 0, sizeof(struct net_device));
6462
6463         /* make sure we BUG if trying to hit standard
6464          * register/unregister code path
6465          */
6466         dev->reg_state = NETREG_DUMMY;
6467
6468         /* NAPI wants this */
6469         INIT_LIST_HEAD(&dev->napi_list);
6470
6471         /* a dummy interface is started by default */
6472         set_bit(__LINK_STATE_PRESENT, &dev->state);
6473         set_bit(__LINK_STATE_START, &dev->state);
6474
6475         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6476          * because users of this 'device' dont need to change
6477          * its refcount.
6478          */
6479
6480         return 0;
6481 }
6482 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6483
6484
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	a format string passed as the name to alloc_netdev is expanded
 *	along the way.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
6508
6509 int netdev_refcnt_read(const struct net_device *dev)
6510 {
6511         int i, refcnt = 0;
6512
6513         for_each_possible_cpu(i)
6514                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6515         return refcnt;
6516 }
6517 EXPORT_SYMBOL(netdev_refcnt_read);
6518
6519 /**
6520  * netdev_wait_allrefs - wait until all references are gone.
6521  * @dev: target net_device
6522  *
6523  * This is called when unregistering network devices.
6524  *
6525  * Any protocol or device that holds a reference should register
6526  * for netdevice notification, and cleanup and put back the
6527  * reference if they receive an UNREGISTER event.
6528  * We can get stuck here if buggy protocols don't correctly
6529  * call dev_put.
6530  */
6531 static void netdev_wait_allrefs(struct net_device *dev)
6532 {
6533         unsigned long rebroadcast_time, warning_time;
6534         int refcnt;
6535
6536         linkwatch_forget_dev(dev);
6537
6538         rebroadcast_time = warning_time = jiffies;
6539         refcnt = netdev_refcnt_read(dev);
6540
6541         while (refcnt != 0) {
6542                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6543                         rtnl_lock();
6544
6545                         /* Rebroadcast unregister notification */
6546                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6547
6548                         __rtnl_unlock();
6549                         rcu_barrier();
6550                         rtnl_lock();
6551
6552                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6553                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6554                                      &dev->state)) {
6555                                 /* We must not have linkwatch events
6556                                  * pending on unregister. If this
6557                                  * happens, we simply run the queue
6558                                  * unscheduled, resulting in a noop
6559                                  * for this device.
6560                                  */
6561                                 linkwatch_run_queue();
6562                         }
6563
6564                         __rtnl_unlock();
6565
6566                         rebroadcast_time = jiffies;
6567                 }
6568
6569                 msleep(250);
6570
6571                 refcnt = netdev_refcnt_read(dev);
6572
6573                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6574                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6575                                  dev->name, refcnt);
6576                         warning_time = jiffies;
6577                 }
6578         }
6579 }
6580
6581 /* The sequence is:
6582  *
6583  *      rtnl_lock();
6584  *      ...
6585  *      register_netdevice(x1);
6586  *      register_netdevice(x2);
6587  *      ...
6588  *      unregister_netdevice(y1);
6589  *      unregister_netdevice(y2);
6590  *      ...
6591  *      rtnl_unlock();
6592  *      free_netdev(y1);
6593  *      free_netdev(y2);
6594  *
6595  * We are invoked by rtnl_unlock().
6596  * This allows us to deal with problems:
6597  * 1) We can delete sysfs objects which invoke hotplug
6598  *    without deadlocking with linkwatch via keventd.
6599  * 2) Since we run with the RTNL semaphore not held, we can sleep
6600  *    safely in order to wait for the netdev refcnt to drop to zero.
6601  *
6602  * We must not return until all unregister events added during
6603  * the interval the lock was held have been completed.
6604  */
6605 void netdev_run_todo(void)
6606 {
6607         struct list_head list;
6608
6609         /* Snapshot list, allow later requests */
6610         list_replace_init(&net_todo_list, &list);
6611
6612         __rtnl_unlock();
6613
6614
6615         /* Wait for rcu callbacks to finish before next phase */
6616         if (!list_empty(&list))
6617                 rcu_barrier();
6618
6619         while (!list_empty(&list)) {
6620                 struct net_device *dev
6621                         = list_first_entry(&list, struct net_device, todo_list);
6622                 list_del(&dev->todo_list);
6623
6624                 rtnl_lock();
6625                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6626                 __rtnl_unlock();
6627
6628                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6629                         pr_err("network todo '%s' but state %d\n",
6630                                dev->name, dev->reg_state);
6631                         dump_stack();
6632                         continue;
6633                 }
6634
6635                 dev->reg_state = NETREG_UNREGISTERED;
6636
6637                 on_each_cpu(flush_backlog, dev, 1);
6638
6639                 netdev_wait_allrefs(dev);
6640
6641                 /* paranoia */
6642                 BUG_ON(netdev_refcnt_read(dev));
6643                 BUG_ON(!list_empty(&dev->ptype_all));
6644                 BUG_ON(!list_empty(&dev->ptype_specific));
6645                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6646                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6647                 WARN_ON(dev->dn_ptr);
6648
6649                 if (dev->destructor)
6650                         dev->destructor(dev);
6651
6652                 /* Report a network device has been unregistered */
6653                 rtnl_lock();
6654                 dev_net(dev)->dev_unreg_count--;
6655                 __rtnl_unlock();
6656                 wake_up(&netdev_unregistering_wq);
6657
6658                 /* Free network device */
6659                 kobject_put(&dev->dev.kobj);
6660         }
6661 }
6662
6663 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6664  * fields in the same order, with only the type differing.
6665  */
6666 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6667                              const struct net_device_stats *netdev_stats)
6668 {
6669 #if BITS_PER_LONG == 64
6670         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6671         memcpy(stats64, netdev_stats, sizeof(*stats64));
6672 #else
6673         size_t i, n = sizeof(*stats64) / sizeof(u64);
6674         const unsigned long *src = (const unsigned long *)netdev_stats;
6675         u64 *dst = (u64 *)stats64;
6676
6677         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6678                      sizeof(*stats64) / sizeof(u64));
6679         for (i = 0; i < n; i++)
6680                 dst[i] = src[i];
6681 #endif
6682 }
6683 EXPORT_SYMBOL(netdev_stats_to_stats64);
6684
6685 /**
6686  *      dev_get_stats   - get network device statistics
6687  *      @dev: device to get statistics from
6688  *      @storage: place to store stats
6689  *
6690  *      Get network statistics from device. Return @storage.
6691  *      The device driver may provide its own method by setting
6692  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6693  *      otherwise the internal statistics structure is used.
6694  */
6695 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6696                                         struct rtnl_link_stats64 *storage)
6697 {
6698         const struct net_device_ops *ops = dev->netdev_ops;
6699
6700         if (ops->ndo_get_stats64) {
6701                 memset(storage, 0, sizeof(*storage));
6702                 ops->ndo_get_stats64(dev, storage);
6703         } else if (ops->ndo_get_stats) {
6704                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6705         } else {
6706                 netdev_stats_to_stats64(storage, &dev->stats);
6707         }
6708         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6709         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6710         return storage;
6711 }
6712 EXPORT_SYMBOL(dev_get_stats);
6713
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with the noop qdisc and published on
 * the device first; NULL is returned if that allocation fails.  Without
 * CONFIG_NET_CLS_ACT the current value is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6731
6732 static const struct ethtool_ops default_ethtool_ops;
6733
6734 void netdev_set_default_ethtool_ops(struct net_device *dev,
6735                                     const struct ethtool_ops *ops)
6736 {
6737         if (dev->ethtool_ops == &default_ethtool_ops)
6738                 dev->ethtool_ops = ops;
6739 }
6740 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6741
6742 void netdev_freemem(struct net_device *dev)
6743 {
6744         char *addr = (char *)dev - dev->padded;
6745
6746         kvfree(addr);
6747 }
6748
6749 /**
6750  *      alloc_netdev_mqs - allocate network device
6751  *      @sizeof_priv:           size of private data to allocate space for
6752  *      @name:                  device name format string
6753  *      @name_assign_type:      origin of device name
6754  *      @setup:                 callback to initialize device
6755  *      @txqs:                  the number of TX subqueues to allocate
6756  *      @rxqs:                  the number of RX subqueues to allocate
6757  *
6758  *      Allocates a struct net_device with private data area for driver use
6759  *      and performs basic initialization.  Also allocates subqueue structs
6760  *      for each queue on the device.
6761  */
6762 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6763                 unsigned char name_assign_type,
6764                 void (*setup)(struct net_device *),
6765                 unsigned int txqs, unsigned int rxqs)
6766 {
6767         struct net_device *dev;
6768         size_t alloc_size;
6769         struct net_device *p;
6770
6771         BUG_ON(strlen(name) >= sizeof(dev->name));
6772
6773         if (txqs < 1) {
6774                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6775                 return NULL;
6776         }
6777
6778 #ifdef CONFIG_SYSFS
6779         if (rxqs < 1) {
6780                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6781                 return NULL;
6782         }
6783 #endif
6784
6785         alloc_size = sizeof(struct net_device);
6786         if (sizeof_priv) {
6787                 /* ensure 32-byte alignment of private area */
6788                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6789                 alloc_size += sizeof_priv;
6790         }
6791         /* ensure 32-byte alignment of whole construct */
6792         alloc_size += NETDEV_ALIGN - 1;
6793
6794         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6795         if (!p)
6796                 p = vzalloc(alloc_size);
6797         if (!p)
6798                 return NULL;
6799
6800         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6801         dev->padded = (char *)dev - (char *)p;
6802
6803         dev->pcpu_refcnt = alloc_percpu(int);
6804         if (!dev->pcpu_refcnt)
6805                 goto free_dev;
6806
6807         if (dev_addr_init(dev))
6808                 goto free_pcpu;
6809
6810         dev_mc_init(dev);
6811         dev_uc_init(dev);
6812
6813         dev_net_set(dev, &init_net);
6814
6815         dev->gso_max_size = GSO_MAX_SIZE;
6816         dev->gso_max_segs = GSO_MAX_SEGS;
6817         dev->gso_min_segs = 0;
6818
6819         INIT_LIST_HEAD(&dev->napi_list);
6820         INIT_LIST_HEAD(&dev->unreg_list);
6821         INIT_LIST_HEAD(&dev->close_list);
6822         INIT_LIST_HEAD(&dev->link_watch_list);
6823         INIT_LIST_HEAD(&dev->adj_list.upper);
6824         INIT_LIST_HEAD(&dev->adj_list.lower);
6825         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6826         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6827         INIT_LIST_HEAD(&dev->ptype_all);
6828         INIT_LIST_HEAD(&dev->ptype_specific);
6829         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6830         setup(dev);
6831
6832         dev->num_tx_queues = txqs;
6833         dev->real_num_tx_queues = txqs;
6834         if (netif_alloc_netdev_queues(dev))
6835                 goto free_all;
6836
6837 #ifdef CONFIG_SYSFS
6838         dev->num_rx_queues = rxqs;
6839         dev->real_num_rx_queues = rxqs;
6840         if (netif_alloc_rx_queues(dev))
6841                 goto free_all;
6842 #endif
6843
6844         strcpy(dev->name, name);
6845         dev->name_assign_type = name_assign_type;
6846         dev->group = INIT_NETDEV_GROUP;
6847         if (!dev->ethtool_ops)
6848                 dev->ethtool_ops = &default_ethtool_ops;
6849         return dev;
6850
6851 free_all:
6852         free_netdev(dev);
6853         return NULL;
6854
6855 free_pcpu:
6856         free_percpu(dev->pcpu_refcnt);
6857 free_dev:
6858         netdev_freemem(dev);
6859         return NULL;
6860 }
6861 EXPORT_SYMBOL(alloc_netdev_mqs);
6862
6863 /**
6864  *      free_netdev - free network device
6865  *      @dev: device
6866  *
6867  *      This function does the last stage of destroying an allocated device
6868  *      interface. The reference to the device object is released.
6869  *      If this is the last reference then it will be freed.
6870  */
6871 void free_netdev(struct net_device *dev)
6872 {
6873         struct napi_struct *p, *n;
6874
6875         netif_free_tx_queues(dev);
6876 #ifdef CONFIG_SYSFS
6877         kvfree(dev->_rx);
6878 #endif
6879
6880         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6881
6882         /* Flush device addresses */
6883         dev_addr_flush(dev);
6884
6885         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6886                 netif_napi_del(p);
6887
6888         free_percpu(dev->pcpu_refcnt);
6889         dev->pcpu_refcnt = NULL;
6890
6891         /*  Compatibility with error handling in drivers */
6892         if (dev->reg_state == NETREG_UNINITIALIZED) {
6893                 netdev_freemem(dev);
6894                 return;
6895         }
6896
6897         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6898         dev->reg_state = NETREG_RELEASED;
6899
6900         /* will free via device release */
6901         put_device(&dev->dev);
6902 }
6903 EXPORT_SYMBOL(free_netdev);
6904
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      May sleep.  The expedited RCU grace period is used when the
 *      rtnl semaphore is held, the normal one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        /* rtnl is held: take the expedited grace period */
        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6920
6921 /**
6922  *      unregister_netdevice_queue - remove device from the kernel
6923  *      @dev: device
6924  *      @head: list
6925  *
6926  *      This function shuts down a device interface and removes it
6927  *      from the kernel tables.
6928  *      If head not NULL, device is queued to be unregistered later.
6929  *
6930  *      Callers must hold the rtnl semaphore.  You may want
6931  *      unregister_netdev() instead of this.
6932  */
6933
6934 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6935 {
6936         ASSERT_RTNL();
6937
6938         if (head) {
6939                 list_move_tail(&dev->unreg_list, head);
6940         } else {
6941                 rollback_registered(dev);
6942                 /* Finish processing unregister after unlock */
6943                 net_set_todo(dev);
6944         }
6945 }
6946 EXPORT_SYMBOL(unregister_netdevice_queue);
6947
6948 /**
6949  *      unregister_netdevice_many - unregister many devices
6950  *      @head: list of devices
6951  *
6952  *  Note: As most callers use a stack allocated list_head,
6953  *  we force a list_del() to make sure stack wont be corrupted later.
6954  */
6955 void unregister_netdevice_many(struct list_head *head)
6956 {
6957         struct net_device *dev;
6958
6959         if (!list_empty(head)) {
6960                 rollback_registered_many(head);
6961                 list_for_each_entry(dev, head, unreg_list)
6962                         net_set_todo(dev);
6963                 list_del(head);
6964         }
6965 }
6966 EXPORT_SYMBOL(unregister_netdevice_many);
6967
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.
 *
 *      This is just a wrapper for unregister_netdevice() that takes
 *      the rtnl semaphore around the call and releases it afterwards.
 *      In general you want to use this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6986
6987 /**
6988  *      dev_change_net_namespace - move device to different nethost namespace
6989  *      @dev: device
6990  *      @net: network namespace
6991  *      @pat: If not NULL name pattern to try if the current device name
6992  *            is already taken in the destination network namespace.
6993  *
6994  *      This function shuts down a device interface and moves it
6995  *      to a new network namespace. On success 0 is returned, on
6996  *      a failure a netagive errno code is returned.
6997  *
6998  *      Callers must hold the rtnl semaphore.
6999  */
7000
7001 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7002 {
7003         int err;
7004
7005         ASSERT_RTNL();
7006
7007         /* Don't allow namespace local devices to be moved. */
7008         err = -EINVAL;
7009         if (dev->features & NETIF_F_NETNS_LOCAL)
7010                 goto out;
7011
7012         /* Ensure the device has been registrered */
7013         if (dev->reg_state != NETREG_REGISTERED)
7014                 goto out;
7015
7016         /* Get out if there is nothing todo */
7017         err = 0;
7018         if (net_eq(dev_net(dev), net))
7019                 goto out;
7020
7021         /* Pick the destination device name, and ensure
7022          * we can use it in the destination network namespace.
7023          */
7024         err = -EEXIST;
7025         if (__dev_get_by_name(net, dev->name)) {
7026                 /* We get here if we can't use the current device name */
7027                 if (!pat)
7028                         goto out;
7029                 if (dev_get_valid_name(net, dev, pat) < 0)
7030                         goto out;
7031         }
7032
7033         /*
7034          * And now a mini version of register_netdevice unregister_netdevice.
7035          */
7036
7037         /* If device is running close it first. */
7038         dev_close(dev);
7039
7040         /* And unlink it from device chain */
7041         err = -ENODEV;
7042         unlist_netdevice(dev);
7043
7044         synchronize_net();
7045
7046         /* Shutdown queueing discipline. */
7047         dev_shutdown(dev);
7048
7049         /* Notify protocols, that we are about to destroy
7050            this device. They should clean all the things.
7051
7052            Note that dev->reg_state stays at NETREG_REGISTERED.
7053            This is wanted because this way 8021q and macvlan know
7054            the device is just moving and can keep their slaves up.
7055         */
7056         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7057         rcu_barrier();
7058         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7059         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7060
7061         /*
7062          *      Flush the unicast and multicast chains
7063          */
7064         dev_uc_flush(dev);
7065         dev_mc_flush(dev);
7066
7067         /* Send a netdev-removed uevent to the old namespace */
7068         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7069         netdev_adjacent_del_links(dev);
7070
7071         /* Actually switch the network namespace */
7072         dev_net_set(dev, net);
7073
7074         /* If there is an ifindex conflict assign a new one */
7075         if (__dev_get_by_index(net, dev->ifindex))
7076                 dev->ifindex = dev_new_index(net);
7077
7078         /* Send a netdev-add uevent to the new namespace */
7079         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7080         netdev_adjacent_add_links(dev);
7081
7082         /* Fixup kobjects */
7083         err = device_rename(&dev->dev, dev->name);
7084         WARN_ON(err);
7085
7086         /* Add the device back in the hashes */
7087         list_netdevice(dev);
7088
7089         /* Notify protocols, that a new device appeared. */
7090         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7091
7092         /*
7093          *      Prevent userspace races by waiting until the network
7094          *      device is fully setup before sending notifications.
7095          */
7096         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7097
7098         synchronize_net();
7099         err = 0;
7100 out:
7101         return err;
7102 }
7103 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7104
7105 static int dev_cpu_callback(struct notifier_block *nfb,
7106                             unsigned long action,
7107                             void *ocpu)
7108 {
7109         struct sk_buff **list_skb;
7110         struct sk_buff *skb;
7111         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7112         struct softnet_data *sd, *oldsd;
7113
7114         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7115                 return NOTIFY_OK;
7116
7117         local_irq_disable();
7118         cpu = smp_processor_id();
7119         sd = &per_cpu(softnet_data, cpu);
7120         oldsd = &per_cpu(softnet_data, oldcpu);
7121
7122         /* Find end of our completion_queue. */
7123         list_skb = &sd->completion_queue;
7124         while (*list_skb)
7125                 list_skb = &(*list_skb)->next;
7126         /* Append completion queue from offline CPU. */
7127         *list_skb = oldsd->completion_queue;
7128         oldsd->completion_queue = NULL;
7129
7130         /* Append output queue from offline CPU. */
7131         if (oldsd->output_queue) {
7132                 *sd->output_queue_tailp = oldsd->output_queue;
7133                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7134                 oldsd->output_queue = NULL;
7135                 oldsd->output_queue_tailp = &oldsd->output_queue;
7136         }
7137         /* Append NAPI poll list from offline CPU, with one exception :
7138          * process_backlog() must be called by cpu owning percpu backlog.
7139          * We properly handle process_queue & input_pkt_queue later.
7140          */
7141         while (!list_empty(&oldsd->poll_list)) {
7142                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7143                                                             struct napi_struct,
7144                                                             poll_list);
7145
7146                 list_del_init(&napi->poll_list);
7147                 if (napi->poll == process_backlog)
7148                         napi->state = 0;
7149                 else
7150                         ____napi_schedule(sd, napi);
7151         }
7152
7153         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7154         local_irq_enable();
7155
7156         /* Process offline CPU's input_pkt_queue */
7157         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7158                 netif_rx_ni(skb);
7159                 input_queue_head_incr(oldsd);
7160         }
7161         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7162                 netif_rx_ni(skb);
7163                 input_queue_head_incr(oldsd);
7164         }
7165
7166         return NOTIFY_OK;
7167 }
7168
7169
7170 /**
7171  *      netdev_increment_features - increment feature set by one
7172  *      @all: current feature set
7173  *      @one: new feature set
7174  *      @mask: mask feature set
7175  *
7176  *      Computes a new feature set after adding a device with feature set
7177  *      @one to the master device with current feature set @all.  Will not
7178  *      enable anything that is off in @mask. Returns the new feature set.
7179  */
7180 netdev_features_t netdev_increment_features(netdev_features_t all,
7181         netdev_features_t one, netdev_features_t mask)
7182 {
7183         if (mask & NETIF_F_GEN_CSUM)
7184                 mask |= NETIF_F_ALL_CSUM;
7185         mask |= NETIF_F_VLAN_CHALLENGED;
7186
7187         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7188         all &= one | ~NETIF_F_ALL_FOR_ALL;
7189
7190         /* If one device supports hw checksumming, set for all. */
7191         if (all & NETIF_F_GEN_CSUM)
7192                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7193
7194         return all;
7195 }
7196 EXPORT_SYMBOL(netdev_increment_features);
7197
7198 static struct hlist_head * __net_init netdev_create_hash(void)
7199 {
7200         int i;
7201         struct hlist_head *hash;
7202
7203         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7204         if (hash != NULL)
7205                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7206                         INIT_HLIST_HEAD(&hash[i]);
7207
7208         return hash;
7209 }
7210
7211 /* Initialize per network namespace state */
7212 static int __net_init netdev_init(struct net *net)
7213 {
7214         if (net != &init_net)
7215                 INIT_LIST_HEAD(&net->dev_base_head);
7216
7217         net->dev_name_head = netdev_create_hash();
7218         if (net->dev_name_head == NULL)
7219                 goto err_name;
7220
7221         net->dev_index_head = netdev_create_hash();
7222         if (net->dev_index_head == NULL)
7223                 goto err_idx;
7224
7225         return 0;
7226
7227 err_idx:
7228         kfree(net->dev_name_head);
7229 err_name:
7230         return -ENOMEM;
7231 }
7232
7233 /**
7234  *      netdev_drivername - network driver for the device
7235  *      @dev: network device
7236  *
7237  *      Determine network driver for device.
7238  */
7239 const char *netdev_drivername(const struct net_device *dev)
7240 {
7241         const struct device_driver *driver;
7242         const struct device *parent;
7243         const char *empty = "";
7244
7245         parent = dev->dev.parent;
7246         if (!parent)
7247                 return empty;
7248
7249         driver = parent->driver;
7250         if (driver && driver->name)
7251                 return driver->name;
7252         return empty;
7253 }
7254
7255 static void __netdev_printk(const char *level, const struct net_device *dev,
7256                             struct va_format *vaf)
7257 {
7258         if (dev && dev->dev.parent) {
7259                 dev_printk_emit(level[1] - '0',
7260                                 dev->dev.parent,
7261                                 "%s %s %s%s: %pV",
7262                                 dev_driver_string(dev->dev.parent),
7263                                 dev_name(dev->dev.parent),
7264                                 netdev_name(dev), netdev_reg_state(dev),
7265                                 vaf);
7266         } else if (dev) {
7267                 printk("%s%s%s: %pV",
7268                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7269         } else {
7270                 printk("%s(NULL net_device): %pV", level, vaf);
7271         }
7272 }
7273
7274 void netdev_printk(const char *level, const struct net_device *dev,
7275                    const char *format, ...)
7276 {
7277         struct va_format vaf;
7278         va_list args;
7279
7280         va_start(args, format);
7281
7282         vaf.fmt = format;
7283         vaf.va = &args;
7284
7285         __netdev_printk(level, dev, &vaf);
7286
7287         va_end(args);
7288 }
7289 EXPORT_SYMBOL(netdev_printk);
7290
7291 #define define_netdev_printk_level(func, level)                 \
7292 void func(const struct net_device *dev, const char *fmt, ...)   \
7293 {                                                               \
7294         struct va_format vaf;                                   \
7295         va_list args;                                           \
7296                                                                 \
7297         va_start(args, fmt);                                    \
7298                                                                 \
7299         vaf.fmt = fmt;                                          \
7300         vaf.va = &args;                                         \
7301                                                                 \
7302         __netdev_printk(level, dev, &vaf);                      \
7303                                                                 \
7304         va_end(args);                                           \
7305 }                                                               \
7306 EXPORT_SYMBOL(func);
7307
7308 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7309 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7310 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7311 define_netdev_printk_level(netdev_err, KERN_ERR);
7312 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7313 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7314 define_netdev_printk_level(netdev_info, KERN_INFO);
7315
/*
 * Per network namespace teardown: free the device name and ifindex
 * hash tables that netdev_init() allocated for @net.
 */
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
}
7321
/*
 * Per-namespace hooks that set up and free the device hash tables;
 * registered with register_pernet_subsys() in net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};
7326
7327 static void __net_exit default_device_exit(struct net *net)
7328 {
7329         struct net_device *dev, *aux;
7330         /*
7331          * Push all migratable network devices back to the
7332          * initial network namespace
7333          */
7334         rtnl_lock();
7335         for_each_netdev_safe(net, dev, aux) {
7336                 int err;
7337                 char fb_name[IFNAMSIZ];
7338
7339                 /* Ignore unmoveable devices (i.e. loopback) */
7340                 if (dev->features & NETIF_F_NETNS_LOCAL)
7341                         continue;
7342
7343                 /* Leave virtual devices for the generic cleanup */
7344                 if (dev->rtnl_link_ops)
7345                         continue;
7346
7347                 /* Push remaining network devices to init_net */
7348                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7349                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7350                 if (err) {
7351                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7352                                  __func__, dev->name, err);
7353                         BUG();
7354                 }
7355         }
7356         rtnl_unlock();
7357 }
7358
/*
 * Take the rtnl lock once no namespace on @net_list has a device that
 * is still unregistering (dev_unreg_count > 0).  While any namespace
 * does, the lock is dropped again and the task waits on
 * netdev_unregistering_wq before rechecking.
 */
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
        /* Return with the rtnl_lock held when there are no network
         * devices unregistering in any network namespace in net_list.
         */
        struct net *net;
        bool unregistering;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
                unregistering = false;
                /* The counters are checked with the rtnl lock held */
                rtnl_lock();
                list_for_each_entry(net, net_list, exit_list) {
                        if (net->dev_unreg_count > 0) {
                                unregistering = true;
                                break;
                        }
                }
                /* Nothing pending: leave the loop still holding rtnl */
                if (!unregistering)
                        break;
                /* Drop the lock (via __rtnl_unlock()) before waiting */
                __rtnl_unlock();

                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        remove_wait_queue(&netdev_unregistering_wq, &wait);
}
7386
7387 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7388 {
7389         /* At exit all network devices most be removed from a network
7390          * namespace.  Do this in the reverse order of registration.
7391          * Do this across as many network namespaces as possible to
7392          * improve batching efficiency.
7393          */
7394         struct net_device *dev;
7395         struct net *net;
7396         LIST_HEAD(dev_kill_list);
7397
7398         /* To prevent network device cleanup code from dereferencing
7399          * loopback devices or network devices that have been freed
7400          * wait here for all pending unregistrations to complete,
7401          * before unregistring the loopback device and allowing the
7402          * network namespace be freed.
7403          *
7404          * The netdev todo list containing all network devices
7405          * unregistrations that happen in default_device_exit_batch
7406          * will run in the rtnl_unlock() at the end of
7407          * default_device_exit_batch.
7408          */
7409         rtnl_lock_unregistering(net_list);
7410         list_for_each_entry(net, net_list, exit_list) {
7411                 for_each_netdev_reverse(net, dev) {
7412                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7413                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7414                         else
7415                                 unregister_netdevice_queue(dev, &dev_kill_list);
7416                 }
7417         }
7418         unregister_netdevice_many(&dev_kill_list);
7419         rtnl_unlock();
7420 }
7421
/*
 * Namespace exit hooks for ordinary devices: .exit moves migratable
 * devices back to init_net, .exit_batch unregisters whatever is left.
 * Registered with register_pernet_device() in net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};
7426
7427 /*
7428  *      Initialize the DEV module. At boot time this walks the device list and
7429  *      unhooks any devices that fail to initialise (normally hardware not
7430  *      present) and leaves us with a valid list of present and active devices.
7431  *
7432  */
7433
/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success.  rc starts out as -ENOMEM, so every
 *       early "goto out" reports -ENOMEM; steps that already
 *       succeeded are not undone on such a failure.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        /* Must run exactly once, while still in the boot phase */
        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        /* Empty packet type and offload handler lists */
        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        INIT_LIST_HEAD(&offload_base);

        /* Per-namespace device name / ifindex hash tables */
        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *      Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                INIT_LIST_HEAD(&sd->poll_list);
                /* Empty output queue: the tail pointer refers to the head */
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->cpu = i;
#endif

                /* Per-cpu backlog NAPI context, polled by process_backlog() */
                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
        }

        dev_boot_phase = 0;

        /* The loopback device is special if any other network devices
         * is present in a network namespace the loopback device must
         * be present. Since we now dynamically allocate and free the
         * loopback device ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices.  Ensuring the loopback devices
         * is the first device that appears and the last network device
         * that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        /* Install the transmit and receive softirq handlers */
        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        /* dev_cpu_callback() takes over queued work from dead CPUs */
        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);