net-timestamp: add key to disambiguate concurrent datagrams
[deliverable/linux.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
1da177e4 100#include <linux/stat.h>
1da177e4
LT
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
44540960 104#include <net/xfrm.h>
1da177e4
LT
105#include <linux/highmem.h>
106#include <linux/init.h>
1da177e4 107#include <linux/module.h>
1da177e4
LT
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
1da177e4 111#include <net/iw_handler.h>
1da177e4 112#include <asm/current.h>
5bdb9886 113#include <linux/audit.h>
db217334 114#include <linux/dmaengine.h>
f6a78bfc 115#include <linux/err.h>
c7fa9d18 116#include <linux/ctype.h>
723e98b7 117#include <linux/if_arp.h>
6de329e2 118#include <linux/if_vlan.h>
8f0f2223 119#include <linux/ip.h>
ad55dcaf 120#include <net/ip.h>
8f0f2223
DM
121#include <linux/ipv6.h>
122#include <linux/in.h>
b6b2fed1
DM
123#include <linux/jhash.h>
124#include <linux/random.h>
9cbc1cb8 125#include <trace/events/napi.h>
cf66ba58 126#include <trace/events/net.h>
07dc22e7 127#include <trace/events/skb.h>
5acbbd42 128#include <linux/pci.h>
caeda9b9 129#include <linux/inetdevice.h>
c445477d 130#include <linux/cpu_rmap.h>
c5905afb 131#include <linux/static_key.h>
af12fa6e 132#include <linux/hashtable.h>
60877a32 133#include <linux/vmalloc.h>
529d0489 134#include <linux/if_macvlan.h>
1da177e4 135
342709ef
PE
136#include "net-sysfs.h"
137
d565b0a1
HX
138/* Instead of increasing this, you should create a hash table. */
139#define MAX_GRO_SKBS 8
140
5d38a079
HX
141/* This should be increased if a protocol with a bigger head is added. */
142#define GRO_MAX_HEAD (MAX_HEADER + 128)
143
1da177e4 144static DEFINE_SPINLOCK(ptype_lock);
62532da9 145static DEFINE_SPINLOCK(offload_lock);
900ff8c6
CW
146struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
147struct list_head ptype_all __read_mostly; /* Taps */
62532da9 148static struct list_head offload_base __read_mostly;
1da177e4 149
ae78dbfa 150static int netif_rx_internal(struct sk_buff *skb);
54951194
LP
151static int call_netdevice_notifiers_info(unsigned long val,
152 struct net_device *dev,
153 struct netdev_notifier_info *info);
ae78dbfa 154
1da177e4 155/*
7562f876 156 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
157 * semaphore.
158 *
c6d14c84 159 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
160 *
161 * Writers must hold the rtnl semaphore while they loop through the
7562f876 162 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
163 * actual updates. This allows pure readers to access the list even
164 * while a writer is preparing to update it.
165 *
166 * To put it another way, dev_base_lock is held for writing only to
167 * protect against pure readers; the rtnl semaphore provides the
168 * protection against other writers.
169 *
170 * See, for example usages, register_netdevice() and
171 * unregister_netdevice(), which must be called with the rtnl
172 * semaphore held.
173 */
1da177e4 174DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
175EXPORT_SYMBOL(dev_base_lock);
176
af12fa6e
ET
177/* protects napi_hash addition/deletion and napi_gen_id */
178static DEFINE_SPINLOCK(napi_hash_lock);
179
180static unsigned int napi_gen_id;
181static DEFINE_HASHTABLE(napi_hash, 8);
182
18afa4b0 183static seqcount_t devnet_rename_seq;
c91f6df2 184
4e985ada
TG
185static inline void dev_base_seq_inc(struct net *net)
186{
187 while (++net->dev_base_seq == 0);
188}
189
881d966b 190static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 191{
95c96174
ED
192 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
193
08e9897d 194 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
195}
196
881d966b 197static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 198{
7c28bd0b 199 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
200}
201
e36fa2f7 202static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
203{
204#ifdef CONFIG_RPS
e36fa2f7 205 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
206#endif
207}
208
e36fa2f7 209static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
210{
211#ifdef CONFIG_RPS
e36fa2f7 212 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
213#endif
214}
215
ce286d32 216/* Device list insertion */
53759be9 217static void list_netdevice(struct net_device *dev)
ce286d32 218{
c346dca1 219 struct net *net = dev_net(dev);
ce286d32
EB
220
221 ASSERT_RTNL();
222
223 write_lock_bh(&dev_base_lock);
c6d14c84 224 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 225 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
226 hlist_add_head_rcu(&dev->index_hlist,
227 dev_index_hash(net, dev->ifindex));
ce286d32 228 write_unlock_bh(&dev_base_lock);
4e985ada
TG
229
230 dev_base_seq_inc(net);
ce286d32
EB
231}
232
fb699dfd
ED
233/* Device list removal
234 * caller must respect a RCU grace period before freeing/reusing dev
235 */
ce286d32
EB
236static void unlist_netdevice(struct net_device *dev)
237{
238 ASSERT_RTNL();
239
240 /* Unlink dev from the device chain */
241 write_lock_bh(&dev_base_lock);
c6d14c84 242 list_del_rcu(&dev->dev_list);
72c9528b 243 hlist_del_rcu(&dev->name_hlist);
fb699dfd 244 hlist_del_rcu(&dev->index_hlist);
ce286d32 245 write_unlock_bh(&dev_base_lock);
4e985ada
TG
246
247 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
248}
249
1da177e4
LT
250/*
251 * Our notifier list
252 */
253
f07d5b94 254static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
255
256/*
257 * Device drivers call our routines to queue packets here. We empty the
258 * queue in the local softnet handler.
259 */
bea3348e 260
9958da05 261DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 262EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 263
cf508b12 264#ifdef CONFIG_LOCKDEP
723e98b7 265/*
c773e847 266 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
267 * according to dev->type
268 */
269static const unsigned short netdev_lock_type[] =
270 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
271 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
272 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
273 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
274 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
275 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
276 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
277 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
278 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
279 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
280 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
281 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
282 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
283 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
284 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 285
36cbd3dc 286static const char *const netdev_lock_name[] =
723e98b7
JP
287 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
288 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
289 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
290 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
291 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
292 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
293 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
294 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
295 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
296 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
297 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
298 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
299 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
300 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
301 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
302
303static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 304static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
305
306static inline unsigned short netdev_lock_pos(unsigned short dev_type)
307{
308 int i;
309
310 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
311 if (netdev_lock_type[i] == dev_type)
312 return i;
313 /* the last key is used by default */
314 return ARRAY_SIZE(netdev_lock_type) - 1;
315}
316
cf508b12
DM
317static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
318 unsigned short dev_type)
723e98b7
JP
319{
320 int i;
321
322 i = netdev_lock_pos(dev_type);
323 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
324 netdev_lock_name[i]);
325}
cf508b12
DM
326
327static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
328{
329 int i;
330
331 i = netdev_lock_pos(dev->type);
332 lockdep_set_class_and_name(&dev->addr_list_lock,
333 &netdev_addr_lock_key[i],
334 netdev_lock_name[i]);
335}
723e98b7 336#else
cf508b12
DM
337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
338 unsigned short dev_type)
339{
340}
341static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
342{
343}
344#endif
1da177e4
LT
345
346/*******************************************************************************
347
348 Protocol management and registration routines
349
350*******************************************************************************/
351
1da177e4
LT
352/*
353 * Add a protocol ID to the list. Now that the input handler is
354 * smarter we can dispense with all the messy stuff that used to be
355 * here.
356 *
357 * BEWARE!!! Protocol handlers, mangling input packets,
358 * MUST BE last in hash buckets and checking protocol handlers
359 * MUST start from promiscuous ptype_all chain in net_bh.
360 * It is true now, do not change it.
361 * Explanation follows: if protocol handler, mangling packet, will
362 * be the first on list, it is not able to sense, that packet
363 * is cloned and should be copied-on-write, so that it will
364 * change it and subsequent readers will get broken packet.
365 * --ANK (980803)
366 */
367
c07b68e8
ED
368static inline struct list_head *ptype_head(const struct packet_type *pt)
369{
370 if (pt->type == htons(ETH_P_ALL))
371 return &ptype_all;
372 else
373 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
374}
375
1da177e4
LT
376/**
377 * dev_add_pack - add packet handler
378 * @pt: packet type declaration
379 *
380 * Add a protocol handler to the networking stack. The passed &packet_type
381 * is linked into kernel lists and may not be freed until it has been
382 * removed from the kernel lists.
383 *
4ec93edb 384 * This call does not sleep therefore it can not
1da177e4
LT
385 * guarantee all CPU's that are in middle of receiving packets
386 * will see the new packet type (until the next received packet).
387 */
388
389void dev_add_pack(struct packet_type *pt)
390{
c07b68e8 391 struct list_head *head = ptype_head(pt);
1da177e4 392
c07b68e8
ED
393 spin_lock(&ptype_lock);
394 list_add_rcu(&pt->list, head);
395 spin_unlock(&ptype_lock);
1da177e4 396}
d1b19dff 397EXPORT_SYMBOL(dev_add_pack);
1da177e4 398
1da177e4
LT
399/**
400 * __dev_remove_pack - remove packet handler
401 * @pt: packet type declaration
402 *
403 * Remove a protocol handler that was previously added to the kernel
404 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
405 * from the kernel lists and can be freed or reused once this function
4ec93edb 406 * returns.
1da177e4
LT
407 *
408 * The packet type might still be in use by receivers
409 * and must not be freed until after all the CPU's have gone
410 * through a quiescent state.
411 */
412void __dev_remove_pack(struct packet_type *pt)
413{
c07b68e8 414 struct list_head *head = ptype_head(pt);
1da177e4
LT
415 struct packet_type *pt1;
416
c07b68e8 417 spin_lock(&ptype_lock);
1da177e4
LT
418
419 list_for_each_entry(pt1, head, list) {
420 if (pt == pt1) {
421 list_del_rcu(&pt->list);
422 goto out;
423 }
424 }
425
7b6cd1ce 426 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 427out:
c07b68e8 428 spin_unlock(&ptype_lock);
1da177e4 429}
d1b19dff
ED
430EXPORT_SYMBOL(__dev_remove_pack);
431
1da177e4
LT
432/**
433 * dev_remove_pack - remove packet handler
434 * @pt: packet type declaration
435 *
436 * Remove a protocol handler that was previously added to the kernel
437 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
438 * from the kernel lists and can be freed or reused once this function
439 * returns.
440 *
441 * This call sleeps to guarantee that no CPU is looking at the packet
442 * type after return.
443 */
444void dev_remove_pack(struct packet_type *pt)
445{
446 __dev_remove_pack(pt);
4ec93edb 447
1da177e4
LT
448 synchronize_net();
449}
d1b19dff 450EXPORT_SYMBOL(dev_remove_pack);
1da177e4 451
62532da9
VY
452
453/**
454 * dev_add_offload - register offload handlers
455 * @po: protocol offload declaration
456 *
457 * Add protocol offload handlers to the networking stack. The passed
458 * &proto_offload is linked into kernel lists and may not be freed until
459 * it has been removed from the kernel lists.
460 *
461 * This call does not sleep therefore it can not
462 * guarantee all CPU's that are in middle of receiving packets
463 * will see the new offload handlers (until the next received packet).
464 */
465void dev_add_offload(struct packet_offload *po)
466{
467 struct list_head *head = &offload_base;
468
469 spin_lock(&offload_lock);
470 list_add_rcu(&po->list, head);
471 spin_unlock(&offload_lock);
472}
473EXPORT_SYMBOL(dev_add_offload);
474
475/**
476 * __dev_remove_offload - remove offload handler
477 * @po: packet offload declaration
478 *
479 * Remove a protocol offload handler that was previously added to the
480 * kernel offload handlers by dev_add_offload(). The passed &offload_type
481 * is removed from the kernel lists and can be freed or reused once this
482 * function returns.
483 *
484 * The packet type might still be in use by receivers
485 * and must not be freed until after all the CPU's have gone
486 * through a quiescent state.
487 */
1d143d9f 488static void __dev_remove_offload(struct packet_offload *po)
62532da9
VY
489{
490 struct list_head *head = &offload_base;
491 struct packet_offload *po1;
492
c53aa505 493 spin_lock(&offload_lock);
62532da9
VY
494
495 list_for_each_entry(po1, head, list) {
496 if (po == po1) {
497 list_del_rcu(&po->list);
498 goto out;
499 }
500 }
501
502 pr_warn("dev_remove_offload: %p not found\n", po);
503out:
c53aa505 504 spin_unlock(&offload_lock);
62532da9 505}
62532da9
VY
506
507/**
508 * dev_remove_offload - remove packet offload handler
509 * @po: packet offload declaration
510 *
511 * Remove a packet offload handler that was previously added to the kernel
512 * offload handlers by dev_add_offload(). The passed &offload_type is
513 * removed from the kernel lists and can be freed or reused once this
514 * function returns.
515 *
516 * This call sleeps to guarantee that no CPU is looking at the packet
517 * type after return.
518 */
519void dev_remove_offload(struct packet_offload *po)
520{
521 __dev_remove_offload(po);
522
523 synchronize_net();
524}
525EXPORT_SYMBOL(dev_remove_offload);
526
1da177e4
LT
527/******************************************************************************
528
529 Device Boot-time Settings Routines
530
531*******************************************************************************/
532
533/* Boot time configuration table */
534static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
535
536/**
537 * netdev_boot_setup_add - add new setup entry
538 * @name: name of the device
539 * @map: configured settings for the device
540 *
541 * Adds new setup entry to the dev_boot_setup list. The function
542 * returns 0 on error and 1 on success. This is a generic routine to
543 * all netdevices.
544 */
545static int netdev_boot_setup_add(char *name, struct ifmap *map)
546{
547 struct netdev_boot_setup *s;
548 int i;
549
550 s = dev_boot_setup;
551 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
552 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
553 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 554 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
555 memcpy(&s[i].map, map, sizeof(s[i].map));
556 break;
557 }
558 }
559
560 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
561}
562
563/**
564 * netdev_boot_setup_check - check boot time settings
565 * @dev: the netdevice
566 *
567 * Check boot time settings for the device.
568 * The found settings are set for the device to be used
569 * later in the device probing.
570 * Returns 0 if no settings found, 1 if they are.
571 */
572int netdev_boot_setup_check(struct net_device *dev)
573{
574 struct netdev_boot_setup *s = dev_boot_setup;
575 int i;
576
577 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
578 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 579 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
580 dev->irq = s[i].map.irq;
581 dev->base_addr = s[i].map.base_addr;
582 dev->mem_start = s[i].map.mem_start;
583 dev->mem_end = s[i].map.mem_end;
584 return 1;
585 }
586 }
587 return 0;
588}
d1b19dff 589EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
590
591
592/**
593 * netdev_boot_base - get address from boot time settings
594 * @prefix: prefix for network device
595 * @unit: id for network device
596 *
597 * Check boot time settings for the base address of device.
598 * The found settings are set for the device to be used
599 * later in the device probing.
600 * Returns 0 if no settings found.
601 */
602unsigned long netdev_boot_base(const char *prefix, int unit)
603{
604 const struct netdev_boot_setup *s = dev_boot_setup;
605 char name[IFNAMSIZ];
606 int i;
607
608 sprintf(name, "%s%d", prefix, unit);
609
610 /*
611 * If device already registered then return base of 1
612 * to indicate not to probe for this interface
613 */
881d966b 614 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
615 return 1;
616
617 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
618 if (!strcmp(name, s[i].name))
619 return s[i].map.base_addr;
620 return 0;
621}
622
623/*
624 * Saves at boot time configured settings for any netdevice.
625 */
626int __init netdev_boot_setup(char *str)
627{
628 int ints[5];
629 struct ifmap map;
630
631 str = get_options(str, ARRAY_SIZE(ints), ints);
632 if (!str || !*str)
633 return 0;
634
635 /* Save settings */
636 memset(&map, 0, sizeof(map));
637 if (ints[0] > 0)
638 map.irq = ints[1];
639 if (ints[0] > 1)
640 map.base_addr = ints[2];
641 if (ints[0] > 2)
642 map.mem_start = ints[3];
643 if (ints[0] > 3)
644 map.mem_end = ints[4];
645
646 /* Add new entry to the list */
647 return netdev_boot_setup_add(str, &map);
648}
649
650__setup("netdev=", netdev_boot_setup);
651
652/*******************************************************************************
653
654 Device Interface Subroutines
655
656*******************************************************************************/
657
658/**
659 * __dev_get_by_name - find a device by its name
c4ea43c5 660 * @net: the applicable net namespace
1da177e4
LT
661 * @name: name to find
662 *
663 * Find an interface by name. Must be called under RTNL semaphore
664 * or @dev_base_lock. If the name is found a pointer to the device
665 * is returned. If the name is not found then %NULL is returned. The
666 * reference counters are not incremented so the caller must be
667 * careful with locks.
668 */
669
881d966b 670struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 671{
0bd8d536
ED
672 struct net_device *dev;
673 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 674
b67bfe0d 675 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
676 if (!strncmp(dev->name, name, IFNAMSIZ))
677 return dev;
0bd8d536 678
1da177e4
LT
679 return NULL;
680}
d1b19dff 681EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 682
72c9528b
ED
683/**
684 * dev_get_by_name_rcu - find a device by its name
685 * @net: the applicable net namespace
686 * @name: name to find
687 *
688 * Find an interface by name.
689 * If the name is found a pointer to the device is returned.
690 * If the name is not found then %NULL is returned.
691 * The reference counters are not incremented so the caller must be
692 * careful with locks. The caller must hold RCU lock.
693 */
694
695struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
696{
72c9528b
ED
697 struct net_device *dev;
698 struct hlist_head *head = dev_name_hash(net, name);
699
b67bfe0d 700 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
701 if (!strncmp(dev->name, name, IFNAMSIZ))
702 return dev;
703
704 return NULL;
705}
706EXPORT_SYMBOL(dev_get_by_name_rcu);
707
1da177e4
LT
708/**
709 * dev_get_by_name - find a device by its name
c4ea43c5 710 * @net: the applicable net namespace
1da177e4
LT
711 * @name: name to find
712 *
713 * Find an interface by name. This can be called from any
714 * context and does its own locking. The returned handle has
715 * the usage count incremented and the caller must use dev_put() to
716 * release it when it is no longer needed. %NULL is returned if no
717 * matching device is found.
718 */
719
881d966b 720struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
721{
722 struct net_device *dev;
723
72c9528b
ED
724 rcu_read_lock();
725 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
726 if (dev)
727 dev_hold(dev);
72c9528b 728 rcu_read_unlock();
1da177e4
LT
729 return dev;
730}
d1b19dff 731EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
732
733/**
734 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 735 * @net: the applicable net namespace
1da177e4
LT
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns %NULL if the device
739 * is not found or a pointer to the device. The device has not
740 * had its reference counter increased so the caller must be careful
741 * about locking. The caller must hold either the RTNL semaphore
742 * or @dev_base_lock.
743 */
744
881d966b 745struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 746{
0bd8d536
ED
747 struct net_device *dev;
748 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 749
b67bfe0d 750 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
751 if (dev->ifindex == ifindex)
752 return dev;
0bd8d536 753
1da177e4
LT
754 return NULL;
755}
d1b19dff 756EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 757
fb699dfd
ED
758/**
759 * dev_get_by_index_rcu - find a device by its ifindex
760 * @net: the applicable net namespace
761 * @ifindex: index of device
762 *
763 * Search for an interface by index. Returns %NULL if the device
764 * is not found or a pointer to the device. The device has not
765 * had its reference counter increased so the caller must be careful
766 * about locking. The caller must hold RCU lock.
767 */
768
769struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
770{
fb699dfd
ED
771 struct net_device *dev;
772 struct hlist_head *head = dev_index_hash(net, ifindex);
773
b67bfe0d 774 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
775 if (dev->ifindex == ifindex)
776 return dev;
777
778 return NULL;
779}
780EXPORT_SYMBOL(dev_get_by_index_rcu);
781
1da177e4
LT
782
783/**
784 * dev_get_by_index - find a device by its ifindex
c4ea43c5 785 * @net: the applicable net namespace
1da177e4
LT
786 * @ifindex: index of device
787 *
788 * Search for an interface by index. Returns NULL if the device
789 * is not found or a pointer to the device. The device returned has
790 * had a reference added and the pointer is safe until the user calls
791 * dev_put to indicate they have finished with it.
792 */
793
881d966b 794struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
795{
796 struct net_device *dev;
797
fb699dfd
ED
798 rcu_read_lock();
799 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
800 if (dev)
801 dev_hold(dev);
fb699dfd 802 rcu_read_unlock();
1da177e4
LT
803 return dev;
804}
d1b19dff 805EXPORT_SYMBOL(dev_get_by_index);
1da177e4 806
5dbe7c17
NS
807/**
808 * netdev_get_name - get a netdevice name, knowing its ifindex.
809 * @net: network namespace
810 * @name: a pointer to the buffer where the name will be stored.
811 * @ifindex: the ifindex of the interface to get the name from.
812 *
813 * The use of raw_seqcount_begin() and cond_resched() before
814 * retrying is required as we want to give the writers a chance
815 * to complete when CONFIG_PREEMPT is not set.
816 */
817int netdev_get_name(struct net *net, char *name, int ifindex)
818{
819 struct net_device *dev;
820 unsigned int seq;
821
822retry:
823 seq = raw_seqcount_begin(&devnet_rename_seq);
824 rcu_read_lock();
825 dev = dev_get_by_index_rcu(net, ifindex);
826 if (!dev) {
827 rcu_read_unlock();
828 return -ENODEV;
829 }
830
831 strcpy(name, dev->name);
832 rcu_read_unlock();
833 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
834 cond_resched();
835 goto retry;
836 }
837
838 return 0;
839}
840
1da177e4 841/**
941666c2 842 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 843 * @net: the applicable net namespace
1da177e4
LT
844 * @type: media type of device
845 * @ha: hardware address
846 *
847 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
848 * is not found or a pointer to the device.
849 * The caller must hold RCU or RTNL.
941666c2 850 * The returned device has not had its ref count increased
1da177e4
LT
851 * and the caller must therefore be careful about locking
852 *
1da177e4
LT
853 */
854
941666c2
ED
855struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
856 const char *ha)
1da177e4
LT
857{
858 struct net_device *dev;
859
941666c2 860 for_each_netdev_rcu(net, dev)
1da177e4
LT
861 if (dev->type == type &&
862 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
863 return dev;
864
865 return NULL;
1da177e4 866}
941666c2 867EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 868
881d966b 869struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
870{
871 struct net_device *dev;
872
4e9cac2b 873 ASSERT_RTNL();
881d966b 874 for_each_netdev(net, dev)
4e9cac2b 875 if (dev->type == type)
7562f876
PE
876 return dev;
877
878 return NULL;
4e9cac2b 879}
4e9cac2b
PM
880EXPORT_SYMBOL(__dev_getfirstbyhwtype);
881
881d966b 882struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 883{
99fe3c39 884 struct net_device *dev, *ret = NULL;
4e9cac2b 885
99fe3c39
ED
886 rcu_read_lock();
887 for_each_netdev_rcu(net, dev)
888 if (dev->type == type) {
889 dev_hold(dev);
890 ret = dev;
891 break;
892 }
893 rcu_read_unlock();
894 return ret;
1da177e4 895}
1da177e4
LT
896EXPORT_SYMBOL(dev_getfirstbyhwtype);
897
898/**
bb69ae04 899 * dev_get_by_flags_rcu - find any device with given flags
c4ea43c5 900 * @net: the applicable net namespace
1da177e4
LT
901 * @if_flags: IFF_* values
902 * @mask: bitmask of bits in if_flags to check
903 *
904 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04
ED
905 * is not found or a pointer to the device. Must be called inside
906 * rcu_read_lock(), and result refcount is unchanged.
1da177e4
LT
907 */
908
bb69ae04 909struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
d1b19dff 910 unsigned short mask)
1da177e4 911{
7562f876 912 struct net_device *dev, *ret;
1da177e4 913
7562f876 914 ret = NULL;
c6d14c84 915 for_each_netdev_rcu(net, dev) {
1da177e4 916 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 917 ret = dev;
1da177e4
LT
918 break;
919 }
920 }
7562f876 921 return ret;
1da177e4 922}
bb69ae04 923EXPORT_SYMBOL(dev_get_by_flags_rcu);
1da177e4
LT
924
925/**
926 * dev_valid_name - check if name is okay for network device
927 * @name: name string
928 *
929 * Network device names need to be valid file names to
c7fa9d18
DM
930 * to allow sysfs to work. We also disallow any kind of
931 * whitespace.
1da177e4 932 */
95f050bf 933bool dev_valid_name(const char *name)
1da177e4 934{
c7fa9d18 935 if (*name == '\0')
95f050bf 936 return false;
b6fe17d6 937 if (strlen(name) >= IFNAMSIZ)
95f050bf 938 return false;
c7fa9d18 939 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 940 return false;
c7fa9d18
DM
941
942 while (*name) {
943 if (*name == '/' || isspace(*name))
95f050bf 944 return false;
c7fa9d18
DM
945 name++;
946 }
95f050bf 947 return true;
1da177e4 948}
d1b19dff 949EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
950
951/**
b267b179
EB
952 * __dev_alloc_name - allocate a name for a device
953 * @net: network namespace to allocate the device name in
1da177e4 954 * @name: name format string
b267b179 955 * @buf: scratch buffer and result name string
1da177e4
LT
956 *
957 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
958 * id. It scans list of devices to build up a free map, then chooses
959 * the first empty slot. The caller must hold the dev_base or rtnl lock
960 * while allocating the name and adding the device in order to avoid
961 * duplicates.
962 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
963 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
964 */
965
b267b179 966static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
967{
968 int i = 0;
1da177e4
LT
969 const char *p;
970 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 971 unsigned long *inuse;
1da177e4
LT
972 struct net_device *d;
973
974 p = strnchr(name, IFNAMSIZ-1, '%');
975 if (p) {
976 /*
977 * Verify the string as this thing may have come from
978 * the user. There must be either one "%d" and no other "%"
979 * characters.
980 */
981 if (p[1] != 'd' || strchr(p + 2, '%'))
982 return -EINVAL;
983
984 /* Use one page as a bit array of possible slots */
cfcabdcc 985 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
986 if (!inuse)
987 return -ENOMEM;
988
881d966b 989 for_each_netdev(net, d) {
1da177e4
LT
990 if (!sscanf(d->name, name, &i))
991 continue;
992 if (i < 0 || i >= max_netdevices)
993 continue;
994
995 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 996 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
997 if (!strncmp(buf, d->name, IFNAMSIZ))
998 set_bit(i, inuse);
999 }
1000
1001 i = find_first_zero_bit(inuse, max_netdevices);
1002 free_page((unsigned long) inuse);
1003 }
1004
d9031024
OP
1005 if (buf != name)
1006 snprintf(buf, IFNAMSIZ, name, i);
b267b179 1007 if (!__dev_get_by_name(net, buf))
1da177e4 1008 return i;
1da177e4
LT
1009
1010 /* It is possible to run out of possible slots
1011 * when the name is long and there isn't enough space left
1012 * for the digits, or if all bits are used.
1013 */
1014 return -ENFILE;
1015}
1016
b267b179
EB
1017/**
1018 * dev_alloc_name - allocate a name for a device
1019 * @dev: device
1020 * @name: name format string
1021 *
1022 * Passed a format string - eg "lt%d" it will try and find a suitable
1023 * id. It scans list of devices to build up a free map, then chooses
1024 * the first empty slot. The caller must hold the dev_base or rtnl lock
1025 * while allocating the name and adding the device in order to avoid
1026 * duplicates.
1027 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1028 * Returns the number of the unit assigned or a negative errno code.
1029 */
1030
1031int dev_alloc_name(struct net_device *dev, const char *name)
1032{
1033 char buf[IFNAMSIZ];
1034 struct net *net;
1035 int ret;
1036
c346dca1
YH
1037 BUG_ON(!dev_net(dev));
1038 net = dev_net(dev);
b267b179
EB
1039 ret = __dev_alloc_name(net, name, buf);
1040 if (ret >= 0)
1041 strlcpy(dev->name, buf, IFNAMSIZ);
1042 return ret;
1043}
d1b19dff 1044EXPORT_SYMBOL(dev_alloc_name);
b267b179 1045
828de4f6
G
1046static int dev_alloc_name_ns(struct net *net,
1047 struct net_device *dev,
1048 const char *name)
d9031024 1049{
828de4f6
G
1050 char buf[IFNAMSIZ];
1051 int ret;
8ce6cebc 1052
828de4f6
G
1053 ret = __dev_alloc_name(net, name, buf);
1054 if (ret >= 0)
1055 strlcpy(dev->name, buf, IFNAMSIZ);
1056 return ret;
1057}
1058
1059static int dev_get_valid_name(struct net *net,
1060 struct net_device *dev,
1061 const char *name)
1062{
1063 BUG_ON(!net);
8ce6cebc 1064
d9031024
OP
1065 if (!dev_valid_name(name))
1066 return -EINVAL;
1067
1c5cae81 1068 if (strchr(name, '%'))
828de4f6 1069 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1070 else if (__dev_get_by_name(net, name))
1071 return -EEXIST;
8ce6cebc
DL
1072 else if (dev->name != name)
1073 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1074
1075 return 0;
1076}
1da177e4
LT
1077
1078/**
1079 * dev_change_name - change name of a device
1080 * @dev: device
1081 * @newname: name (or format string) must be at least IFNAMSIZ
1082 *
1083 * Change name of a device, can pass format strings "eth%d".
1084 * for wildcarding.
1085 */
cf04a4c7 1086int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1087{
238fa362 1088 unsigned char old_assign_type;
fcc5a03a 1089 char oldname[IFNAMSIZ];
1da177e4 1090 int err = 0;
fcc5a03a 1091 int ret;
881d966b 1092 struct net *net;
1da177e4
LT
1093
1094 ASSERT_RTNL();
c346dca1 1095 BUG_ON(!dev_net(dev));
1da177e4 1096
c346dca1 1097 net = dev_net(dev);
1da177e4
LT
1098 if (dev->flags & IFF_UP)
1099 return -EBUSY;
1100
30e6c9fa 1101 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1102
1103 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1104 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1105 return 0;
c91f6df2 1106 }
c8d90dca 1107
fcc5a03a
HX
1108 memcpy(oldname, dev->name, IFNAMSIZ);
1109
828de4f6 1110 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1111 if (err < 0) {
30e6c9fa 1112 write_seqcount_end(&devnet_rename_seq);
d9031024 1113 return err;
c91f6df2 1114 }
1da177e4 1115
6fe82a39
VF
1116 if (oldname[0] && !strchr(oldname, '%'))
1117 netdev_info(dev, "renamed from %s\n", oldname);
1118
238fa362
TG
1119 old_assign_type = dev->name_assign_type;
1120 dev->name_assign_type = NET_NAME_RENAMED;
1121
fcc5a03a 1122rollback:
a1b3f594
EB
1123 ret = device_rename(&dev->dev, dev->name);
1124 if (ret) {
1125 memcpy(dev->name, oldname, IFNAMSIZ);
238fa362 1126 dev->name_assign_type = old_assign_type;
30e6c9fa 1127 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1128 return ret;
dcc99773 1129 }
7f988eab 1130
30e6c9fa 1131 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1132
5bb025fa
VF
1133 netdev_adjacent_rename_links(dev, oldname);
1134
7f988eab 1135 write_lock_bh(&dev_base_lock);
372b2312 1136 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1137 write_unlock_bh(&dev_base_lock);
1138
1139 synchronize_rcu();
1140
1141 write_lock_bh(&dev_base_lock);
1142 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1143 write_unlock_bh(&dev_base_lock);
1144
056925ab 1145 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1146 ret = notifier_to_errno(ret);
1147
1148 if (ret) {
91e9c07b
ED
1149 /* err >= 0 after dev_alloc_name() or stores the first errno */
1150 if (err >= 0) {
fcc5a03a 1151 err = ret;
30e6c9fa 1152 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a 1153 memcpy(dev->name, oldname, IFNAMSIZ);
5bb025fa 1154 memcpy(oldname, newname, IFNAMSIZ);
238fa362
TG
1155 dev->name_assign_type = old_assign_type;
1156 old_assign_type = NET_NAME_RENAMED;
fcc5a03a 1157 goto rollback;
91e9c07b 1158 } else {
7b6cd1ce 1159 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1160 dev->name, ret);
fcc5a03a
HX
1161 }
1162 }
1da177e4
LT
1163
1164 return err;
1165}
1166
0b815a1a
SH
1167/**
1168 * dev_set_alias - change ifalias of a device
1169 * @dev: device
1170 * @alias: name up to IFALIASZ
f0db275a 1171 * @len: limit of bytes to copy from info
0b815a1a
SH
1172 *
1173 * Set ifalias for a device,
1174 */
1175int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1176{
7364e445
AK
1177 char *new_ifalias;
1178
0b815a1a
SH
1179 ASSERT_RTNL();
1180
1181 if (len >= IFALIASZ)
1182 return -EINVAL;
1183
96ca4a2c 1184 if (!len) {
388dfc2d
SK
1185 kfree(dev->ifalias);
1186 dev->ifalias = NULL;
96ca4a2c
OH
1187 return 0;
1188 }
1189
7364e445
AK
1190 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1191 if (!new_ifalias)
0b815a1a 1192 return -ENOMEM;
7364e445 1193 dev->ifalias = new_ifalias;
0b815a1a
SH
1194
1195 strlcpy(dev->ifalias, alias, len+1);
1196 return len;
1197}
1198
1199
d8a33ac4 1200/**
3041a069 1201 * netdev_features_change - device changes features
d8a33ac4
SH
1202 * @dev: device to cause notification
1203 *
1204 * Called to indicate a device has changed features.
1205 */
1206void netdev_features_change(struct net_device *dev)
1207{
056925ab 1208 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1209}
1210EXPORT_SYMBOL(netdev_features_change);
1211
1da177e4
LT
1212/**
1213 * netdev_state_change - device changes state
1214 * @dev: device to cause notification
1215 *
1216 * Called to indicate a device has changed state. This function calls
1217 * the notifier chains for netdev_chain and sends a NEWLINK message
1218 * to the routing socket.
1219 */
1220void netdev_state_change(struct net_device *dev)
1221{
1222 if (dev->flags & IFF_UP) {
54951194
LP
1223 struct netdev_notifier_change_info change_info;
1224
1225 change_info.flags_changed = 0;
1226 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1227 &change_info.info);
7f294054 1228 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1da177e4
LT
1229 }
1230}
d1b19dff 1231EXPORT_SYMBOL(netdev_state_change);
1da177e4 1232
ee89bab1
AW
1233/**
1234 * netdev_notify_peers - notify network peers about existence of @dev
1235 * @dev: network device
1236 *
1237 * Generate traffic such that interested network peers are aware of
1238 * @dev, such as by generating a gratuitous ARP. This may be used when
1239 * a device wants to inform the rest of the network about some sort of
1240 * reconfiguration such as a failover event or virtual machine
1241 * migration.
1242 */
1243void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1244{
ee89bab1
AW
1245 rtnl_lock();
1246 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1247 rtnl_unlock();
c1da4ac7 1248}
ee89bab1 1249EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1250
bd380811 1251static int __dev_open(struct net_device *dev)
1da177e4 1252{
d314774c 1253 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1254 int ret;
1da177e4 1255
e46b66bc
BH
1256 ASSERT_RTNL();
1257
1da177e4
LT
1258 if (!netif_device_present(dev))
1259 return -ENODEV;
1260
ca99ca14
NH
1261 /* Block netpoll from trying to do any rx path servicing.
1262 * If we don't do this there is a chance ndo_poll_controller
1263 * or ndo_poll may be running while we open the device
1264 */
66b5552f 1265 netpoll_poll_disable(dev);
ca99ca14 1266
3b8bcfd5
JB
1267 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1268 ret = notifier_to_errno(ret);
1269 if (ret)
1270 return ret;
1271
1da177e4 1272 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1273
d314774c
SH
1274 if (ops->ndo_validate_addr)
1275 ret = ops->ndo_validate_addr(dev);
bada339b 1276
d314774c
SH
1277 if (!ret && ops->ndo_open)
1278 ret = ops->ndo_open(dev);
1da177e4 1279
66b5552f 1280 netpoll_poll_enable(dev);
ca99ca14 1281
bada339b
JG
1282 if (ret)
1283 clear_bit(__LINK_STATE_START, &dev->state);
1284 else {
1da177e4 1285 dev->flags |= IFF_UP;
b4bd07c2 1286 net_dmaengine_get();
4417da66 1287 dev_set_rx_mode(dev);
1da177e4 1288 dev_activate(dev);
7bf23575 1289 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1290 }
bada339b 1291
1da177e4
LT
1292 return ret;
1293}
1294
1295/**
bd380811
PM
1296 * dev_open - prepare an interface for use.
1297 * @dev: device to open
1da177e4 1298 *
bd380811
PM
1299 * Takes a device from down to up state. The device's private open
1300 * function is invoked and then the multicast lists are loaded. Finally
1301 * the device is moved into the up state and a %NETDEV_UP message is
1302 * sent to the netdev notifier chain.
1303 *
1304 * Calling this function on an active interface is a nop. On a failure
1305 * a negative errno code is returned.
1da177e4 1306 */
bd380811
PM
1307int dev_open(struct net_device *dev)
1308{
1309 int ret;
1310
bd380811
PM
1311 if (dev->flags & IFF_UP)
1312 return 0;
1313
bd380811
PM
1314 ret = __dev_open(dev);
1315 if (ret < 0)
1316 return ret;
1317
7f294054 1318 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
bd380811
PM
1319 call_netdevice_notifiers(NETDEV_UP, dev);
1320
1321 return ret;
1322}
1323EXPORT_SYMBOL(dev_open);
1324
44345724 1325static int __dev_close_many(struct list_head *head)
1da177e4 1326{
44345724 1327 struct net_device *dev;
e46b66bc 1328
bd380811 1329 ASSERT_RTNL();
9d5010db
DM
1330 might_sleep();
1331
5cde2829 1332 list_for_each_entry(dev, head, close_list) {
3f4df206 1333 /* Temporarily disable netpoll until the interface is down */
66b5552f 1334 netpoll_poll_disable(dev);
3f4df206 1335
44345724 1336 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1337
44345724 1338 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1339
44345724
OP
1340 /* Synchronize to scheduled poll. We cannot touch poll list, it
1341 * can be even on different cpu. So just clear netif_running().
1342 *
1343 * dev->stop() will invoke napi_disable() on all of it's
1344 * napi_struct instances on this device.
1345 */
4e857c58 1346 smp_mb__after_atomic(); /* Commit netif_running(). */
44345724 1347 }
1da177e4 1348
44345724 1349 dev_deactivate_many(head);
d8b2a4d2 1350
5cde2829 1351 list_for_each_entry(dev, head, close_list) {
44345724 1352 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1353
44345724
OP
1354 /*
1355 * Call the device specific close. This cannot fail.
1356 * Only if device is UP
1357 *
1358 * We allow it to be called even after a DETACH hot-plug
1359 * event.
1360 */
1361 if (ops->ndo_stop)
1362 ops->ndo_stop(dev);
1363
44345724 1364 dev->flags &= ~IFF_UP;
44345724 1365 net_dmaengine_put();
66b5552f 1366 netpoll_poll_enable(dev);
44345724
OP
1367 }
1368
1369 return 0;
1370}
1371
1372static int __dev_close(struct net_device *dev)
1373{
f87e6f47 1374 int retval;
44345724
OP
1375 LIST_HEAD(single);
1376
5cde2829 1377 list_add(&dev->close_list, &single);
f87e6f47
LT
1378 retval = __dev_close_many(&single);
1379 list_del(&single);
ca99ca14 1380
f87e6f47 1381 return retval;
44345724
OP
1382}
1383
3fbd8758 1384static int dev_close_many(struct list_head *head)
44345724
OP
1385{
1386 struct net_device *dev, *tmp;
1da177e4 1387
5cde2829
EB
1388 /* Remove the devices that don't need to be closed */
1389 list_for_each_entry_safe(dev, tmp, head, close_list)
44345724 1390 if (!(dev->flags & IFF_UP))
5cde2829 1391 list_del_init(&dev->close_list);
44345724
OP
1392
1393 __dev_close_many(head);
1da177e4 1394
5cde2829 1395 list_for_each_entry_safe(dev, tmp, head, close_list) {
7f294054 1396 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
44345724 1397 call_netdevice_notifiers(NETDEV_DOWN, dev);
5cde2829 1398 list_del_init(&dev->close_list);
44345724 1399 }
bd380811
PM
1400
1401 return 0;
1402}
1403
1404/**
1405 * dev_close - shutdown an interface.
1406 * @dev: device to shutdown
1407 *
1408 * This function moves an active device into down state. A
1409 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1410 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1411 * chain.
1412 */
1413int dev_close(struct net_device *dev)
1414{
e14a5993
ED
1415 if (dev->flags & IFF_UP) {
1416 LIST_HEAD(single);
1da177e4 1417
5cde2829 1418 list_add(&dev->close_list, &single);
e14a5993
ED
1419 dev_close_many(&single);
1420 list_del(&single);
1421 }
da6e378b 1422 return 0;
1da177e4 1423}
d1b19dff 1424EXPORT_SYMBOL(dev_close);
1da177e4
LT
1425
1426
0187bdfb
BH
1427/**
1428 * dev_disable_lro - disable Large Receive Offload on a device
1429 * @dev: device
1430 *
1431 * Disable Large Receive Offload (LRO) on a net device. Must be
1432 * called under RTNL. This is needed if received packets may be
1433 * forwarded to another interface.
1434 */
1435void dev_disable_lro(struct net_device *dev)
1436{
f11970e3
NH
1437 /*
1438 * If we're trying to disable lro on a vlan device
1439 * use the underlying physical device instead
1440 */
1441 if (is_vlan_dev(dev))
1442 dev = vlan_dev_real_dev(dev);
1443
529d0489
MK
1444 /* the same for macvlan devices */
1445 if (netif_is_macvlan(dev))
1446 dev = macvlan_dev_real_dev(dev);
1447
bc5787c6
MM
1448 dev->wanted_features &= ~NETIF_F_LRO;
1449 netdev_update_features(dev);
27660515 1450
22d5969f
MM
1451 if (unlikely(dev->features & NETIF_F_LRO))
1452 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1453}
1454EXPORT_SYMBOL(dev_disable_lro);
1455
351638e7
JP
1456static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1457 struct net_device *dev)
1458{
1459 struct netdev_notifier_info info;
1460
1461 netdev_notifier_info_init(&info, dev);
1462 return nb->notifier_call(nb, val, &info);
1463}
0187bdfb 1464
881d966b
EB
1465static int dev_boot_phase = 1;
1466
1da177e4
LT
1467/**
1468 * register_netdevice_notifier - register a network notifier block
1469 * @nb: notifier
1470 *
1471 * Register a notifier to be called when network device events occur.
1472 * The notifier passed is linked into the kernel structures and must
1473 * not be reused until it has been unregistered. A negative errno code
1474 * is returned on a failure.
1475 *
1476 * When registered all registration and up events are replayed
4ec93edb 1477 * to the new notifier to allow device to have a race free
1da177e4
LT
1478 * view of the network device list.
1479 */
1480
1481int register_netdevice_notifier(struct notifier_block *nb)
1482{
1483 struct net_device *dev;
fcc5a03a 1484 struct net_device *last;
881d966b 1485 struct net *net;
1da177e4
LT
1486 int err;
1487
1488 rtnl_lock();
f07d5b94 1489 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1490 if (err)
1491 goto unlock;
881d966b
EB
1492 if (dev_boot_phase)
1493 goto unlock;
1494 for_each_net(net) {
1495 for_each_netdev(net, dev) {
351638e7 1496 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
881d966b
EB
1497 err = notifier_to_errno(err);
1498 if (err)
1499 goto rollback;
1500
1501 if (!(dev->flags & IFF_UP))
1502 continue;
1da177e4 1503
351638e7 1504 call_netdevice_notifier(nb, NETDEV_UP, dev);
881d966b 1505 }
1da177e4 1506 }
fcc5a03a
HX
1507
1508unlock:
1da177e4
LT
1509 rtnl_unlock();
1510 return err;
fcc5a03a
HX
1511
1512rollback:
1513 last = dev;
881d966b
EB
1514 for_each_net(net) {
1515 for_each_netdev(net, dev) {
1516 if (dev == last)
8f891489 1517 goto outroll;
fcc5a03a 1518
881d966b 1519 if (dev->flags & IFF_UP) {
351638e7
JP
1520 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1521 dev);
1522 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
881d966b 1523 }
351638e7 1524 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1525 }
fcc5a03a 1526 }
c67625a1 1527
8f891489 1528outroll:
c67625a1 1529 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1530 goto unlock;
1da177e4 1531}
d1b19dff 1532EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1533
1534/**
1535 * unregister_netdevice_notifier - unregister a network notifier block
1536 * @nb: notifier
1537 *
1538 * Unregister a notifier previously registered by
1539 * register_netdevice_notifier(). The notifier is unlinked into the
1540 * kernel structures and may then be reused. A negative errno code
1541 * is returned on a failure.
7d3d43da
EB
1542 *
1543 * After unregistering unregister and down device events are synthesized
1544 * for all devices on the device list to the removed notifier to remove
1545 * the need for special case cleanup code.
1da177e4
LT
1546 */
1547
1548int unregister_netdevice_notifier(struct notifier_block *nb)
1549{
7d3d43da
EB
1550 struct net_device *dev;
1551 struct net *net;
9f514950
HX
1552 int err;
1553
1554 rtnl_lock();
f07d5b94 1555 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1556 if (err)
1557 goto unlock;
1558
1559 for_each_net(net) {
1560 for_each_netdev(net, dev) {
1561 if (dev->flags & IFF_UP) {
351638e7
JP
1562 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1563 dev);
1564 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
7d3d43da 1565 }
351638e7 1566 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1567 }
1568 }
1569unlock:
9f514950
HX
1570 rtnl_unlock();
1571 return err;
1da177e4 1572}
d1b19dff 1573EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4 1574
351638e7
JP
1575/**
1576 * call_netdevice_notifiers_info - call all network notifier blocks
1577 * @val: value passed unmodified to notifier function
1578 * @dev: net_device pointer passed unmodified to notifier function
1579 * @info: notifier information data
1580 *
1581 * Call all network notifier blocks. Parameters and return value
1582 * are as for raw_notifier_call_chain().
1583 */
1584
1d143d9f 1585static int call_netdevice_notifiers_info(unsigned long val,
1586 struct net_device *dev,
1587 struct netdev_notifier_info *info)
351638e7
JP
1588{
1589 ASSERT_RTNL();
1590 netdev_notifier_info_init(info, dev);
1591 return raw_notifier_call_chain(&netdev_chain, val, info);
1592}
351638e7 1593
1da177e4
LT
1594/**
1595 * call_netdevice_notifiers - call all network notifier blocks
1596 * @val: value passed unmodified to notifier function
c4ea43c5 1597 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1598 *
1599 * Call all network notifier blocks. Parameters and return value
f07d5b94 1600 * are as for raw_notifier_call_chain().
1da177e4
LT
1601 */
1602
ad7379d4 1603int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1604{
351638e7
JP
1605 struct netdev_notifier_info info;
1606
1607 return call_netdevice_notifiers_info(val, dev, &info);
1da177e4 1608}
edf947f1 1609EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1610
c5905afb 1611static struct static_key netstamp_needed __read_mostly;
b90e5794 1612#ifdef HAVE_JUMP_LABEL
c5905afb 1613/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1614 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1615 * static_key_slow_dec() calls.
b90e5794
ED
1616 */
1617static atomic_t netstamp_needed_deferred;
1618#endif
1da177e4
LT
1619
1620void net_enable_timestamp(void)
1621{
b90e5794
ED
1622#ifdef HAVE_JUMP_LABEL
1623 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1624
1625 if (deferred) {
1626 while (--deferred)
c5905afb 1627 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1628 return;
1629 }
1630#endif
c5905afb 1631 static_key_slow_inc(&netstamp_needed);
1da177e4 1632}
d1b19dff 1633EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1634
1635void net_disable_timestamp(void)
1636{
b90e5794
ED
1637#ifdef HAVE_JUMP_LABEL
1638 if (in_interrupt()) {
1639 atomic_inc(&netstamp_needed_deferred);
1640 return;
1641 }
1642#endif
c5905afb 1643 static_key_slow_dec(&netstamp_needed);
1da177e4 1644}
d1b19dff 1645EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1646
3b098e2d 1647static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1648{
588f0330 1649 skb->tstamp.tv64 = 0;
c5905afb 1650 if (static_key_false(&netstamp_needed))
a61bbcf2 1651 __net_timestamp(skb);
1da177e4
LT
1652}
1653
588f0330 1654#define net_timestamp_check(COND, SKB) \
c5905afb 1655 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1656 if ((COND) && !(SKB)->tstamp.tv64) \
1657 __net_timestamp(SKB); \
1658 } \
3b098e2d 1659
1ee481fb 1660bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
79b569f0
DL
1661{
1662 unsigned int len;
1663
1664 if (!(dev->flags & IFF_UP))
1665 return false;
1666
1667 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1668 if (skb->len <= len)
1669 return true;
1670
1671 /* if TSO is enabled, we don't care about the length as the packet
1672 * could be forwarded without being segmented before
1673 */
1674 if (skb_is_gso(skb))
1675 return true;
1676
1677 return false;
1678}
1ee481fb 1679EXPORT_SYMBOL_GPL(is_skb_forwardable);
79b569f0 1680
a0265d28
HX
1681int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1682{
1683 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1684 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1685 atomic_long_inc(&dev->rx_dropped);
1686 kfree_skb(skb);
1687 return NET_RX_DROP;
1688 }
1689 }
1690
1691 if (unlikely(!is_skb_forwardable(dev, skb))) {
1692 atomic_long_inc(&dev->rx_dropped);
1693 kfree_skb(skb);
1694 return NET_RX_DROP;
1695 }
1696
1697 skb_scrub_packet(skb, true);
1698 skb->protocol = eth_type_trans(skb, dev);
1699
1700 return 0;
1701}
1702EXPORT_SYMBOL_GPL(__dev_forward_skb);
1703
44540960
AB
1704/**
1705 * dev_forward_skb - loopback an skb to another netif
1706 *
1707 * @dev: destination network device
1708 * @skb: buffer to forward
1709 *
1710 * return values:
1711 * NET_RX_SUCCESS (no congestion)
6ec82562 1712 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1713 *
1714 * dev_forward_skb can be used for injecting an skb from the
1715 * start_xmit function of one device into the receive queue
1716 * of another device.
1717 *
1718 * The receiving device may be in another namespace, so
1719 * we have to clear all information in the skb that could
1720 * impact namespace isolation.
1721 */
1722int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1723{
a0265d28 1724 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
44540960
AB
1725}
1726EXPORT_SYMBOL_GPL(dev_forward_skb);
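/*
 * Illustrative sketch, not part of this file: a veth-style paired device
 * handing every transmitted frame to its peer via dev_forward_skb().
 * example_pair_peer() and the pairing itself are assumptions.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *peer = example_pair_peer(dev);       /* hypothetical */

        /* dev_forward_skb() consumes the skb on both success and drop */
        if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
                dev->stats.tx_dropped++;

        return NETDEV_TX_OK;
}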
1727
71d9dec2
CG
1728static inline int deliver_skb(struct sk_buff *skb,
1729 struct packet_type *pt_prev,
1730 struct net_device *orig_dev)
1731{
1080e512
MT
1732 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1733 return -ENOMEM;
71d9dec2
CG
1734 atomic_inc(&skb->users);
1735 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1736}
1737
c0de08d0
EL
1738static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1739{
a3d744e9 1740 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1741 return false;
1742
1743 if (ptype->id_match)
1744 return ptype->id_match(ptype, skb->sk);
1745 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1746 return true;
1747
1748 return false;
1749}
1750
1da177e4
LT
1751/*
1752 * Support routine. Sends outgoing frames to any network
1753 * taps currently in use.
1754 */
1755
f6a78bfc 1756static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1757{
1758 struct packet_type *ptype;
71d9dec2
CG
1759 struct sk_buff *skb2 = NULL;
1760 struct packet_type *pt_prev = NULL;
a61bbcf2 1761
1da177e4
LT
1762 rcu_read_lock();
1763 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1764 /* Never send packets back to the socket
1765 * they originated from - MvS (miquels@drinkel.ow.org)
1766 */
1767 if ((ptype->dev == dev || !ptype->dev) &&
c0de08d0 1768 (!skb_loop_sk(ptype, skb))) {
71d9dec2
CG
1769 if (pt_prev) {
1770 deliver_skb(skb2, pt_prev, skb->dev);
1771 pt_prev = ptype;
1772 continue;
1773 }
1774
1775 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1776 if (!skb2)
1777 break;
1778
70978182
ED
1779 net_timestamp_set(skb2);
1780
1da177e4
LT
1781 /* skb->nh should be correctly
1782 set by sender, so that the second statement is
1783 just protection against buggy protocols.
1784 */
459a98ed 1785 skb_reset_mac_header(skb2);
1da177e4 1786
d56f90a7 1787 if (skb_network_header(skb2) < skb2->data ||
ced14f68 1788 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
e87cc472
JP
1789 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1790 ntohs(skb2->protocol),
1791 dev->name);
c1d2bbe1 1792 skb_reset_network_header(skb2);
1da177e4
LT
1793 }
1794
b0e380b1 1795 skb2->transport_header = skb2->network_header;
1da177e4 1796 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1797 pt_prev = ptype;
1da177e4
LT
1798 }
1799 }
71d9dec2
CG
1800 if (pt_prev)
1801 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1802 rcu_read_unlock();
1803}
1804
2c53040f
BH
1805/**
1806 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1807 * @dev: Network device
1808 * @txq: number of queues available
1809 *
 1810 * If real_num_tx_queues is changed, the tc mappings may no longer be
 1811 * valid. To resolve this, verify that each tc mapping remains valid and,
 1812 * if not, NULL the mapping. With no priorities mapping to this
 1813 * offset/count pair, it will no longer be used. In the worst case, TC0
 1814 * is invalid and nothing can be done, so disable priority mappings. It is
 1815 * expected that drivers will fix this mapping if they can before
 1816 * calling netif_set_real_num_tx_queues.
1817 */
bb134d22 1818static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1819{
1820 int i;
1821 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1822
1823 /* If TC0 is invalidated disable TC mapping */
1824 if (tc->offset + tc->count > txq) {
7b6cd1ce 1825 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1826 dev->num_tc = 0;
1827 return;
1828 }
1829
1830 /* Invalidated prio to tc mappings set to TC0 */
1831 for (i = 1; i < TC_BITMASK + 1; i++) {
1832 int q = netdev_get_prio_tc_map(dev, i);
1833
1834 tc = &dev->tc_to_txq[q];
1835 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1836 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1837 i, q);
4f57c087
JF
1838 netdev_set_prio_tc_map(dev, i, 0);
1839 }
1840 }
1841}
1842
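/*
 * Illustrative sketch, not part of this file: how a driver typically builds
 * the prio->tc and tc->txq mappings that netif_setup_tc() revalidates.
 * num_tc and queues_per_tc are assumed driver parameters.
 */
static int example_setup_tc(struct net_device *dev, u8 num_tc, u16 queues_per_tc)
{
        int tc;

        if (netdev_set_num_tc(dev, num_tc))
                return -EINVAL;

        for (tc = 0; tc < num_tc; tc++) {
                /* tc N owns queues [N * queues_per_tc, (N + 1) * queues_per_tc) */
                netdev_set_tc_queue(dev, tc, queues_per_tc, tc * queues_per_tc);
                /* map priority N straight onto tc N; higher prios keep the default 0 */
                netdev_set_prio_tc_map(dev, tc, tc);
        }
        return 0;
}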
537c00de
AD
1843#ifdef CONFIG_XPS
1844static DEFINE_MUTEX(xps_map_mutex);
1845#define xmap_dereference(P) \
1846 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1847
10cdc3f3
AD
1848static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1849 int cpu, u16 index)
537c00de 1850{
10cdc3f3
AD
1851 struct xps_map *map = NULL;
1852 int pos;
537c00de 1853
10cdc3f3
AD
1854 if (dev_maps)
1855 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1856
10cdc3f3
AD
1857 for (pos = 0; map && pos < map->len; pos++) {
1858 if (map->queues[pos] == index) {
537c00de
AD
1859 if (map->len > 1) {
1860 map->queues[pos] = map->queues[--map->len];
1861 } else {
10cdc3f3 1862 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1863 kfree_rcu(map, rcu);
1864 map = NULL;
1865 }
10cdc3f3 1866 break;
537c00de 1867 }
537c00de
AD
1868 }
1869
10cdc3f3
AD
1870 return map;
1871}
1872
024e9679 1873static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1874{
1875 struct xps_dev_maps *dev_maps;
024e9679 1876 int cpu, i;
10cdc3f3
AD
1877 bool active = false;
1878
1879 mutex_lock(&xps_map_mutex);
1880 dev_maps = xmap_dereference(dev->xps_maps);
1881
1882 if (!dev_maps)
1883 goto out_no_maps;
1884
1885 for_each_possible_cpu(cpu) {
024e9679
AD
1886 for (i = index; i < dev->num_tx_queues; i++) {
1887 if (!remove_xps_queue(dev_maps, cpu, i))
1888 break;
1889 }
1890 if (i == dev->num_tx_queues)
10cdc3f3
AD
1891 active = true;
1892 }
1893
1894 if (!active) {
537c00de
AD
1895 RCU_INIT_POINTER(dev->xps_maps, NULL);
1896 kfree_rcu(dev_maps, rcu);
1897 }
1898
024e9679
AD
1899 for (i = index; i < dev->num_tx_queues; i++)
1900 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1901 NUMA_NO_NODE);
1902
537c00de
AD
1903out_no_maps:
1904 mutex_unlock(&xps_map_mutex);
1905}
1906
01c5f864
AD
1907static struct xps_map *expand_xps_map(struct xps_map *map,
1908 int cpu, u16 index)
1909{
1910 struct xps_map *new_map;
1911 int alloc_len = XPS_MIN_MAP_ALLOC;
1912 int i, pos;
1913
1914 for (pos = 0; map && pos < map->len; pos++) {
1915 if (map->queues[pos] != index)
1916 continue;
1917 return map;
1918 }
1919
1920 /* Need to add queue to this CPU's existing map */
1921 if (map) {
1922 if (pos < map->alloc_len)
1923 return map;
1924
1925 alloc_len = map->alloc_len * 2;
1926 }
1927
1928 /* Need to allocate new map to store queue on this CPU's map */
1929 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1930 cpu_to_node(cpu));
1931 if (!new_map)
1932 return NULL;
1933
1934 for (i = 0; i < pos; i++)
1935 new_map->queues[i] = map->queues[i];
1936 new_map->alloc_len = alloc_len;
1937 new_map->len = pos;
1938
1939 return new_map;
1940}
1941
3573540c
MT
1942int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1943 u16 index)
537c00de 1944{
01c5f864 1945 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 1946 struct xps_map *map, *new_map;
537c00de 1947 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
1948 int cpu, numa_node_id = -2;
1949 bool active = false;
537c00de
AD
1950
1951 mutex_lock(&xps_map_mutex);
1952
1953 dev_maps = xmap_dereference(dev->xps_maps);
1954
01c5f864
AD
1955 /* allocate memory for queue storage */
1956 for_each_online_cpu(cpu) {
1957 if (!cpumask_test_cpu(cpu, mask))
1958 continue;
1959
1960 if (!new_dev_maps)
1961 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
1962 if (!new_dev_maps) {
1963 mutex_unlock(&xps_map_mutex);
01c5f864 1964 return -ENOMEM;
2bb60cb9 1965 }
01c5f864
AD
1966
1967 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1968 NULL;
1969
1970 map = expand_xps_map(map, cpu, index);
1971 if (!map)
1972 goto error;
1973
1974 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1975 }
1976
1977 if (!new_dev_maps)
1978 goto out_no_new_maps;
1979
537c00de 1980 for_each_possible_cpu(cpu) {
01c5f864
AD
1981 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1982 /* add queue to CPU maps */
1983 int pos = 0;
1984
1985 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1986 while ((pos < map->len) && (map->queues[pos] != index))
1987 pos++;
1988
1989 if (pos == map->len)
1990 map->queues[map->len++] = index;
537c00de 1991#ifdef CONFIG_NUMA
537c00de
AD
1992 if (numa_node_id == -2)
1993 numa_node_id = cpu_to_node(cpu);
1994 else if (numa_node_id != cpu_to_node(cpu))
1995 numa_node_id = -1;
537c00de 1996#endif
01c5f864
AD
1997 } else if (dev_maps) {
1998 /* fill in the new device map from the old device map */
1999 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2000 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 2001 }
01c5f864 2002
537c00de
AD
2003 }
2004
01c5f864
AD
2005 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2006
537c00de 2007 /* Cleanup old maps */
01c5f864
AD
2008 if (dev_maps) {
2009 for_each_possible_cpu(cpu) {
2010 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2011 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2012 if (map && map != new_map)
2013 kfree_rcu(map, rcu);
2014 }
537c00de 2015
01c5f864 2016 kfree_rcu(dev_maps, rcu);
537c00de
AD
2017 }
2018
01c5f864
AD
2019 dev_maps = new_dev_maps;
2020 active = true;
537c00de 2021
01c5f864
AD
2022out_no_new_maps:
2023 /* update Tx queue numa node */
537c00de
AD
2024 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2025 (numa_node_id >= 0) ? numa_node_id :
2026 NUMA_NO_NODE);
2027
01c5f864
AD
2028 if (!dev_maps)
2029 goto out_no_maps;
2030
2031 /* removes queue from unused CPUs */
2032 for_each_possible_cpu(cpu) {
2033 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2034 continue;
2035
2036 if (remove_xps_queue(dev_maps, cpu, index))
2037 active = true;
2038 }
2039
2040 /* free map if not active */
2041 if (!active) {
2042 RCU_INIT_POINTER(dev->xps_maps, NULL);
2043 kfree_rcu(dev_maps, rcu);
2044 }
2045
2046out_no_maps:
537c00de
AD
2047 mutex_unlock(&xps_map_mutex);
2048
2049 return 0;
2050error:
01c5f864
AD
2051 /* remove any maps that we added */
2052 for_each_possible_cpu(cpu) {
2053 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2054 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2055 NULL;
2056 if (new_map && new_map != map)
2057 kfree(new_map);
2058 }
2059
537c00de
AD
2060 mutex_unlock(&xps_map_mutex);
2061
537c00de
AD
2062 kfree(new_dev_maps);
2063 return -ENOMEM;
2064}
2065EXPORT_SYMBOL(netif_set_xps_queue);
2066
2067#endif
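/*
 * Illustrative sketch, not part of this file: a multiqueue driver spreading
 * its TX queues over the online CPUs with netif_set_xps_queue(), one CPU
 * per queue.  Errors are ignored for brevity.
 */
static void example_setup_xps(struct net_device *dev)
{
        cpumask_var_t mask;
        int cpu = cpumask_first(cpu_online_mask);
        unsigned int i;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return;

        for (i = 0; i < dev->real_num_tx_queues; i++) {
                cpumask_clear(mask);
                cpumask_set_cpu(cpu, mask);
                netif_set_xps_queue(dev, mask, i);

                cpu = cpumask_next(cpu, cpu_online_mask);
                if (cpu >= nr_cpu_ids)
                        cpu = cpumask_first(cpu_online_mask);
        }
        free_cpumask_var(mask);
}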
f0796d5c
JF
2068/*
2069 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2070 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2071 */
e6484930 2072int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2073{
1d24eb48
TH
2074 int rc;
2075
e6484930
TH
2076 if (txq < 1 || txq > dev->num_tx_queues)
2077 return -EINVAL;
f0796d5c 2078
5c56580b
BH
2079 if (dev->reg_state == NETREG_REGISTERED ||
2080 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2081 ASSERT_RTNL();
2082
1d24eb48
TH
2083 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2084 txq);
bf264145
TH
2085 if (rc)
2086 return rc;
2087
4f57c087
JF
2088 if (dev->num_tc)
2089 netif_setup_tc(dev, txq);
2090
024e9679 2091 if (txq < dev->real_num_tx_queues) {
e6484930 2092 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2093#ifdef CONFIG_XPS
2094 netif_reset_xps_queues_gt(dev, txq);
2095#endif
2096 }
f0796d5c 2097 }
e6484930
TH
2098
2099 dev->real_num_tx_queues = txq;
2100 return 0;
f0796d5c
JF
2101}
2102EXPORT_SYMBOL(netif_set_real_num_tx_queues);
56079431 2103
a953be53 2104#ifdef CONFIG_SYSFS
62fe0b40
BH
2105/**
2106 * netif_set_real_num_rx_queues - set actual number of RX queues used
2107 * @dev: Network device
2108 * @rxq: Actual number of RX queues
2109 *
2110 * This must be called either with the rtnl_lock held or before
2111 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2112 * negative error code. If called before registration, it always
2113 * succeeds.
62fe0b40
BH
2114 */
2115int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2116{
2117 int rc;
2118
bd25fa7b
TH
2119 if (rxq < 1 || rxq > dev->num_rx_queues)
2120 return -EINVAL;
2121
62fe0b40
BH
2122 if (dev->reg_state == NETREG_REGISTERED) {
2123 ASSERT_RTNL();
2124
62fe0b40
BH
2125 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2126 rxq);
2127 if (rc)
2128 return rc;
62fe0b40
BH
2129 }
2130
2131 dev->real_num_rx_queues = rxq;
2132 return 0;
2133}
2134EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2135#endif
2136
2c53040f
BH
2137/**
2138 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2139 *
2140 * This routine should set an upper limit on the number of RSS queues
2141 * used by default by multiqueue devices.
2142 */
a55b138b 2143int netif_get_num_default_rss_queues(void)
16917b87
YM
2144{
2145 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2146}
2147EXPORT_SYMBOL(netif_get_num_default_rss_queues);
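/*
 * Illustrative sketch, not part of this file: sizing the queue counts at
 * probe time, before register_netdev(), by capping the hardware maximum
 * with the default RSS limit.  example_hw_max_queues() is a hypothetical
 * capability query.
 */
static int example_size_queues(struct net_device *dev)
{
        unsigned int n = min_t(unsigned int, example_hw_max_queues(dev),
                               netif_get_num_default_rss_queues());
        int err;

        err = netif_set_real_num_tx_queues(dev, n);
        if (err)
                return err;
        return netif_set_real_num_rx_queues(dev, n);
}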
2148
def82a1d 2149static inline void __netif_reschedule(struct Qdisc *q)
56079431 2150{
def82a1d
JP
2151 struct softnet_data *sd;
2152 unsigned long flags;
56079431 2153
def82a1d
JP
2154 local_irq_save(flags);
2155 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
2156 q->next_sched = NULL;
2157 *sd->output_queue_tailp = q;
2158 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2159 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2160 local_irq_restore(flags);
2161}
2162
2163void __netif_schedule(struct Qdisc *q)
2164{
2165 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2166 __netif_reschedule(q);
56079431
DV
2167}
2168EXPORT_SYMBOL(__netif_schedule);
2169
e6247027
ED
2170struct dev_kfree_skb_cb {
2171 enum skb_free_reason reason;
2172};
2173
2174static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
56079431 2175{
e6247027
ED
2176 return (struct dev_kfree_skb_cb *)skb->cb;
2177}
2178
2179void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
56079431 2180{
e6247027 2181 unsigned long flags;
56079431 2182
e6247027
ED
2183 if (likely(atomic_read(&skb->users) == 1)) {
2184 smp_rmb();
2185 atomic_set(&skb->users, 0);
2186 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2187 return;
bea3348e 2188 }
e6247027
ED
2189 get_kfree_skb_cb(skb)->reason = reason;
2190 local_irq_save(flags);
2191 skb->next = __this_cpu_read(softnet_data.completion_queue);
2192 __this_cpu_write(softnet_data.completion_queue, skb);
2193 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2194 local_irq_restore(flags);
56079431 2195}
e6247027 2196EXPORT_SYMBOL(__dev_kfree_skb_irq);
56079431 2197
e6247027 2198void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
56079431
DV
2199{
2200 if (in_irq() || irqs_disabled())
e6247027 2201 __dev_kfree_skb_irq(skb, reason);
56079431
DV
2202 else
2203 dev_kfree_skb(skb);
2204}
e6247027 2205EXPORT_SYMBOL(__dev_kfree_skb_any);
56079431
DV
2206
2207
bea3348e
SH
2208/**
2209 * netif_device_detach - mark device as removed
2210 * @dev: network device
2211 *
2212 * Mark device as removed from system and therefore no longer available.
2213 */
56079431
DV
2214void netif_device_detach(struct net_device *dev)
2215{
2216 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2217 netif_running(dev)) {
d543103a 2218 netif_tx_stop_all_queues(dev);
56079431
DV
2219 }
2220}
2221EXPORT_SYMBOL(netif_device_detach);
2222
bea3348e
SH
2223/**
2224 * netif_device_attach - mark device as attached
2225 * @dev: network device
2226 *
2227 * Mark device as attached from system and restart if needed.
2228 */
56079431
DV
2229void netif_device_attach(struct net_device *dev)
2230{
2231 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2232 netif_running(dev)) {
d543103a 2233 netif_tx_wake_all_queues(dev);
4ec93edb 2234 __netdev_watchdog_up(dev);
56079431
DV
2235 }
2236}
2237EXPORT_SYMBOL(netif_device_attach);
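/*
 * Illustrative sketch, not part of this file: the usual suspend/resume
 * pairing of netif_device_detach()/netif_device_attach() in a driver's PM
 * callbacks.  Device-specific teardown and bring-up are elided.
 */
static int example_suspend(struct device *d)
{
        struct net_device *ndev = dev_get_drvdata(d);

        netif_device_detach(ndev);
        /* ... quiesce DMA, save state, power the NIC down ... */
        return 0;
}

static int example_resume(struct device *d)
{
        struct net_device *ndev = dev_get_drvdata(d);

        /* ... power up, restore state, re-arm interrupts ... */
        netif_device_attach(ndev);
        return 0;
}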
2238
36c92474
BH
2239static void skb_warn_bad_offload(const struct sk_buff *skb)
2240{
65e9d2fa 2241 static const netdev_features_t null_features = 0;
36c92474
BH
2242 struct net_device *dev = skb->dev;
2243 const char *driver = "";
2244
c846ad9b
BG
2245 if (!net_ratelimit())
2246 return;
2247
36c92474
BH
2248 if (dev && dev->dev.parent)
2249 driver = dev_driver_string(dev->dev.parent);
2250
2251 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2252 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2253 driver, dev ? &dev->features : &null_features,
2254 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2255 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2256 skb_shinfo(skb)->gso_type, skb->ip_summed);
2257}
2258
1da177e4
LT
2259/*
2260 * Invalidate hardware checksum when packet is to be mangled, and
2261 * complete checksum manually on outgoing path.
2262 */
84fa7933 2263int skb_checksum_help(struct sk_buff *skb)
1da177e4 2264{
d3bc23e7 2265 __wsum csum;
663ead3b 2266 int ret = 0, offset;
1da177e4 2267
84fa7933 2268 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2269 goto out_set_summed;
2270
2271 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2272 skb_warn_bad_offload(skb);
2273 return -EINVAL;
1da177e4
LT
2274 }
2275
cef401de
ED
2276 /* Before computing a checksum, we should make sure no frag could
 2277 * be modified by an external entity: the checksum could be wrong.
2278 */
2279 if (skb_has_shared_frag(skb)) {
2280 ret = __skb_linearize(skb);
2281 if (ret)
2282 goto out;
2283 }
2284
55508d60 2285 offset = skb_checksum_start_offset(skb);
a030847e
HX
2286 BUG_ON(offset >= skb_headlen(skb));
2287 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2288
2289 offset += skb->csum_offset;
2290 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2291
2292 if (skb_cloned(skb) &&
2293 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2294 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2295 if (ret)
2296 goto out;
2297 }
2298
a030847e 2299 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 2300out_set_summed:
1da177e4 2301 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2302out:
1da177e4
LT
2303 return ret;
2304}
d1b19dff 2305EXPORT_SYMBOL(skb_checksum_help);
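/*
 * Illustrative sketch, not part of this file: a transmit path falling back
 * to skb_checksum_help() when the device cannot offload the checksum for
 * this skb.  The hardware hand-off itself is elided.
 */
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            !(dev->features & NETIF_F_ALL_CSUM) &&
            skb_checksum_help(skb))
                goto drop;

        /* ... map the buffer and queue it to the hardware ... */
        return NETDEV_TX_OK;

drop:
        dev_kfree_skb_any(skb);
        return NETDEV_TX_OK;
}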
1da177e4 2306
53d6471c 2307__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
f6a78bfc 2308{
4b9b1cdf 2309 unsigned int vlan_depth = skb->mac_len;
252e3346 2310 __be16 type = skb->protocol;
f6a78bfc 2311
19acc327
PS
2312 /* Tunnel gso handlers can set protocol to ethernet. */
2313 if (type == htons(ETH_P_TEB)) {
2314 struct ethhdr *eth;
2315
2316 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2317 return 0;
2318
2319 eth = (struct ethhdr *)skb_mac_header(skb);
2320 type = eth->h_proto;
2321 }
2322
4b9b1cdf
NA
2323 /* if skb->protocol is 802.1Q/AD then the header should already be
2324 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2325 * ETH_HLEN otherwise
2326 */
2327 if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2328 if (vlan_depth) {
80019d31 2329 if (WARN_ON(vlan_depth < VLAN_HLEN))
4b9b1cdf
NA
2330 return 0;
2331 vlan_depth -= VLAN_HLEN;
2332 } else {
2333 vlan_depth = ETH_HLEN;
2334 }
2335 do {
2336 struct vlan_hdr *vh;
2337
2338 if (unlikely(!pskb_may_pull(skb,
2339 vlan_depth + VLAN_HLEN)))
2340 return 0;
2341
2342 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2343 type = vh->h_vlan_encapsulated_proto;
2344 vlan_depth += VLAN_HLEN;
2345 } while (type == htons(ETH_P_8021Q) ||
2346 type == htons(ETH_P_8021AD));
7b9c6090
JG
2347 }
2348
53d6471c
VY
2349 *depth = vlan_depth;
2350
ec5f0615
PS
2351 return type;
2352}
2353
2354/**
2355 * skb_mac_gso_segment - mac layer segmentation handler.
2356 * @skb: buffer to segment
2357 * @features: features for the output path (see dev->features)
2358 */
2359struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2360 netdev_features_t features)
2361{
2362 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2363 struct packet_offload *ptype;
53d6471c
VY
2364 int vlan_depth = skb->mac_len;
2365 __be16 type = skb_network_protocol(skb, &vlan_depth);
ec5f0615
PS
2366
2367 if (unlikely(!type))
2368 return ERR_PTR(-EINVAL);
2369
53d6471c 2370 __skb_pull(skb, vlan_depth);
f6a78bfc
HX
2371
2372 rcu_read_lock();
22061d80 2373 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2374 if (ptype->type == type && ptype->callbacks.gso_segment) {
84fa7933 2375 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
05e8ef4a
PS
2376 int err;
2377
f191a1d1 2378 err = ptype->callbacks.gso_send_check(skb);
a430a43d
HX
2379 segs = ERR_PTR(err);
2380 if (err || skb_gso_ok(skb, features))
2381 break;
d56f90a7
ACM
2382 __skb_push(skb, (skb->data -
2383 skb_network_header(skb)));
a430a43d 2384 }
f191a1d1 2385 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2386 break;
2387 }
2388 }
2389 rcu_read_unlock();
2390
98e399f8 2391 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2392
f6a78bfc
HX
2393 return segs;
2394}
05e8ef4a
PS
2395EXPORT_SYMBOL(skb_mac_gso_segment);
2396
2397
2398/* Open vSwitch calls this on the rx path, so we need a different check.
2399 */
2400static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2401{
2402 if (tx_path)
2403 return skb->ip_summed != CHECKSUM_PARTIAL;
2404 else
2405 return skb->ip_summed == CHECKSUM_NONE;
2406}
2407
2408/**
2409 * __skb_gso_segment - Perform segmentation on skb.
2410 * @skb: buffer to segment
2411 * @features: features for the output path (see dev->features)
2412 * @tx_path: whether it is called in TX path
2413 *
2414 * This function segments the given skb and returns a list of segments.
2415 *
2416 * It may return NULL if the skb requires no segmentation. This is
2417 * only possible when GSO is used for verifying header integrity.
2418 */
2419struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2420 netdev_features_t features, bool tx_path)
2421{
2422 if (unlikely(skb_needs_check(skb, tx_path))) {
2423 int err;
2424
2425 skb_warn_bad_offload(skb);
2426
a40e0a66 2427 err = skb_cow_head(skb, 0);
2428 if (err < 0)
05e8ef4a
PS
2429 return ERR_PTR(err);
2430 }
2431
68c33163 2432 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3347c960
ED
2433 SKB_GSO_CB(skb)->encap_level = 0;
2434
05e8ef4a
PS
2435 skb_reset_mac_header(skb);
2436 skb_reset_mac_len(skb);
2437
2438 return skb_mac_gso_segment(skb, features);
2439}
12b0004d 2440EXPORT_SYMBOL(__skb_gso_segment);
f6a78bfc 2441
fb286bb2
HX
2442/* Take action when hardware reception checksum errors are detected. */
2443#ifdef CONFIG_BUG
2444void netdev_rx_csum_fault(struct net_device *dev)
2445{
2446 if (net_ratelimit()) {
7b6cd1ce 2447 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2448 dump_stack();
2449 }
2450}
2451EXPORT_SYMBOL(netdev_rx_csum_fault);
2452#endif
2453
1da177e4
LT
2454/* Actually, we should eliminate this check as soon as we know that:
 2455 * 1. IOMMU is present and allows mapping all the memory.
2456 * 2. No high memory really exists on this machine.
2457 */
2458
c1e756bf 2459static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2460{
3d3a8533 2461#ifdef CONFIG_HIGHMEM
1da177e4 2462 int i;
5acbbd42 2463 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2464 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2465 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2466 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2467 return 1;
ea2ab693 2468 }
5acbbd42 2469 }
1da177e4 2470
5acbbd42
FT
2471 if (PCI_DMA_BUS_IS_PHYS) {
2472 struct device *pdev = dev->dev.parent;
1da177e4 2473
9092c658
ED
2474 if (!pdev)
2475 return 0;
5acbbd42 2476 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2477 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2478 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2479 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2480 return 1;
2481 }
2482 }
3d3a8533 2483#endif
1da177e4
LT
2484 return 0;
2485}
1da177e4 2486
f6a78bfc
HX
2487struct dev_gso_cb {
2488 void (*destructor)(struct sk_buff *skb);
2489};
2490
2491#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2492
2493static void dev_gso_skb_destructor(struct sk_buff *skb)
2494{
2495 struct dev_gso_cb *cb;
2496
289dccbe
ED
2497 kfree_skb_list(skb->next);
2498 skb->next = NULL;
f6a78bfc
HX
2499
2500 cb = DEV_GSO_CB(skb);
2501 if (cb->destructor)
2502 cb->destructor(skb);
2503}
2504
2505/**
2506 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2507 * @skb: buffer to segment
91ecb63c 2508 * @features: device features as applicable to this skb
f6a78bfc
HX
2509 *
2510 * This function segments the given skb and stores the list of segments
2511 * in skb->next.
2512 */
c8f44aff 2513static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
f6a78bfc 2514{
f6a78bfc 2515 struct sk_buff *segs;
576a30eb
HX
2516
2517 segs = skb_gso_segment(skb, features);
2518
2519 /* Verifying header integrity only. */
2520 if (!segs)
2521 return 0;
f6a78bfc 2522
801678c5 2523 if (IS_ERR(segs))
f6a78bfc
HX
2524 return PTR_ERR(segs);
2525
2526 skb->next = segs;
2527 DEV_GSO_CB(skb)->destructor = skb->destructor;
2528 skb->destructor = dev_gso_skb_destructor;
2529
2530 return 0;
2531}
2532
3b392ddb
SH
2533/* If MPLS offload request, verify we are testing hardware MPLS features
2534 * instead of standard features for the netdev.
2535 */
2536#ifdef CONFIG_NET_MPLS_GSO
2537static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538 netdev_features_t features,
2539 __be16 type)
2540{
2541 if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2542 features &= skb->dev->mpls_features;
2543
2544 return features;
2545}
2546#else
2547static netdev_features_t net_mpls_features(struct sk_buff *skb,
2548 netdev_features_t features,
2549 __be16 type)
2550{
2551 return features;
2552}
2553#endif
2554
c8f44aff 2555static netdev_features_t harmonize_features(struct sk_buff *skb,
c1e756bf 2556 netdev_features_t features)
f01a5236 2557{
53d6471c 2558 int tmp;
3b392ddb
SH
2559 __be16 type;
2560
2561 type = skb_network_protocol(skb, &tmp);
2562 features = net_mpls_features(skb, features, type);
53d6471c 2563
c0d680e5 2564 if (skb->ip_summed != CHECKSUM_NONE &&
3b392ddb 2565 !can_checksum_protocol(features, type)) {
f01a5236 2566 features &= ~NETIF_F_ALL_CSUM;
c1e756bf 2567 } else if (illegal_highdma(skb->dev, skb)) {
f01a5236
JG
2568 features &= ~NETIF_F_SG;
2569 }
2570
2571 return features;
2572}
2573
c1e756bf 2574netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6
JG
2575{
2576 __be16 protocol = skb->protocol;
c1e756bf 2577 netdev_features_t features = skb->dev->features;
58e998c6 2578
c1e756bf 2579 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
30b678d8
BH
2580 features &= ~NETIF_F_GSO_MASK;
2581
8ad227ff 2582 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
58e998c6
JG
2583 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2584 protocol = veh->h_vlan_encapsulated_proto;
f01a5236 2585 } else if (!vlan_tx_tag_present(skb)) {
c1e756bf 2586 return harmonize_features(skb, features);
f01a5236 2587 }
58e998c6 2588
c1e756bf 2589 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
8ad227ff 2590 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2591
cdbaa0bb 2592 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
f01a5236 2593 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
8ad227ff
PM
2594 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2595 NETIF_F_HW_VLAN_STAG_TX;
cdbaa0bb 2596
c1e756bf 2597 return harmonize_features(skb, features);
58e998c6 2598}
c1e756bf 2599EXPORT_SYMBOL(netif_skb_features);
58e998c6 2600
fd2ea0a7 2601int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
f663dd9a 2602 struct netdev_queue *txq)
f6a78bfc 2603{
00829823 2604 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2605 int rc = NETDEV_TX_OK;
ec764bf0 2606 unsigned int skb_len;
00829823 2607
f6a78bfc 2608 if (likely(!skb->next)) {
c8f44aff 2609 netdev_features_t features;
fc741216 2610
93f154b5 2611 /*
25985edc 2612 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
 2613 * it's hot in this cpu cache
2614 */
adf30907
ED
2615 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2616 skb_dst_drop(skb);
2617
fc741216
JG
2618 features = netif_skb_features(skb);
2619
7b9c6090 2620 if (vlan_tx_tag_present(skb) &&
86a9bad3
PM
2621 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2622 skb = __vlan_put_tag(skb, skb->vlan_proto,
2623 vlan_tx_tag_get(skb));
7b9c6090
JG
2624 if (unlikely(!skb))
2625 goto out;
2626
2627 skb->vlan_tci = 0;
2628 }
2629
fc70fb64
AD
 2630 /* If this is an encapsulation offload request, verify we are
 2631 * testing hardware encapsulation features instead of standard
 2632 * features for the netdev.
2633 */
2634 if (skb->encapsulation)
2635 features &= dev->hw_enc_features;
2636
fc741216 2637 if (netif_needs_gso(skb, features)) {
91ecb63c 2638 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2639 goto out_kfree_skb;
2640 if (skb->next)
2641 goto gso;
6afff0ca 2642 } else {
02932ce9 2643 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2644 __skb_linearize(skb))
2645 goto out_kfree_skb;
2646
2647 /* If packet is not checksummed and device does not
2648 * support checksumming for this protocol, complete
2649 * checksumming here.
2650 */
2651 if (skb->ip_summed == CHECKSUM_PARTIAL) {
fc70fb64
AD
2652 if (skb->encapsulation)
2653 skb_set_inner_transport_header(skb,
2654 skb_checksum_start_offset(skb));
2655 else
2656 skb_set_transport_header(skb,
2657 skb_checksum_start_offset(skb));
03634668 2658 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2659 skb_checksum_help(skb))
2660 goto out_kfree_skb;
2661 }
9ccb8975
DM
2662 }
2663
b40863c6
ED
2664 if (!list_empty(&ptype_all))
2665 dev_queue_xmit_nit(skb, dev);
2666
ec764bf0 2667 skb_len = skb->len;
d87d04a7 2668 trace_net_dev_start_xmit(skb, dev);
20567661 2669 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2670 trace_net_dev_xmit(skb, rc, dev, skb_len);
f663dd9a 2671 if (rc == NETDEV_TX_OK)
08baf561 2672 txq_trans_update(txq);
ac45f602 2673 return rc;
f6a78bfc
HX
2674 }
2675
576a30eb 2676gso:
f6a78bfc
HX
2677 do {
2678 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2679
2680 skb->next = nskb->next;
2681 nskb->next = NULL;
068a2de5 2682
b40863c6
ED
2683 if (!list_empty(&ptype_all))
2684 dev_queue_xmit_nit(nskb, dev);
2685
ec764bf0 2686 skb_len = nskb->len;
d87d04a7 2687 trace_net_dev_start_xmit(nskb, dev);
f663dd9a 2688 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2689 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2690 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2691 if (rc & ~NETDEV_TX_MASK)
2692 goto out_kfree_gso_skb;
f54d9e8d 2693 nskb->next = skb->next;
f6a78bfc
HX
2694 skb->next = nskb;
2695 return rc;
2696 }
08baf561 2697 txq_trans_update(txq);
73466498 2698 if (unlikely(netif_xmit_stopped(txq) && skb->next))
f54d9e8d 2699 return NETDEV_TX_BUSY;
f6a78bfc 2700 } while (skb->next);
4ec93edb 2701
572a9d7b 2702out_kfree_gso_skb:
0c772159 2703 if (likely(skb->next == NULL)) {
572a9d7b 2704 skb->destructor = DEV_GSO_CB(skb)->destructor;
0c772159
SS
2705 consume_skb(skb);
2706 return rc;
2707 }
f6a78bfc
HX
2708out_kfree_skb:
2709 kfree_skb(skb);
7b9c6090 2710out:
572a9d7b 2711 return rc;
f6a78bfc 2712}
a6cc0cfa 2713EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
f6a78bfc 2714
1def9238
ED
2715static void qdisc_pkt_len_init(struct sk_buff *skb)
2716{
2717 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2718
2719 qdisc_skb_cb(skb)->pkt_len = skb->len;
2720
2721 /* To get more precise estimation of bytes sent on wire,
2722 * we add to pkt_len the headers size of all segments
2723 */
2724 if (shinfo->gso_size) {
757b8b1d 2725 unsigned int hdr_len;
15e5a030 2726 u16 gso_segs = shinfo->gso_segs;
1def9238 2727
757b8b1d
ED
2728 /* mac layer + network layer */
2729 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2730
2731 /* + transport layer */
1def9238
ED
2732 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2733 hdr_len += tcp_hdrlen(skb);
2734 else
2735 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2736
2737 if (shinfo->gso_type & SKB_GSO_DODGY)
2738 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2739 shinfo->gso_size);
2740
2741 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2742 }
2743}
2744
bbd8a0d3
KK
2745static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2746 struct net_device *dev,
2747 struct netdev_queue *txq)
2748{
2749 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2750 bool contended;
bbd8a0d3
KK
2751 int rc;
2752
1def9238 2753 qdisc_pkt_len_init(skb);
a2da570d 2754 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2755 /*
2756 * Heuristic to force contended enqueues to serialize on a
2757 * separate lock before trying to get qdisc main lock.
9bf2b8c2
YX
2758 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2759 * often and dequeue packets faster.
79640a4c 2760 */
a2da570d 2761 contended = qdisc_is_running(q);
79640a4c
ED
2762 if (unlikely(contended))
2763 spin_lock(&q->busylock);
2764
bbd8a0d3
KK
2765 spin_lock(root_lock);
2766 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2767 kfree_skb(skb);
2768 rc = NET_XMIT_DROP;
2769 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2770 qdisc_run_begin(q)) {
bbd8a0d3
KK
2771 /*
2772 * This is a work-conserving queue; there are no old skbs
2773 * waiting to be sent out; and the qdisc is not running -
2774 * xmit the skb directly.
2775 */
7fee226a
ED
2776 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2777 skb_dst_force(skb);
bfe0d029 2778
bfe0d029
ED
2779 qdisc_bstats_update(q, skb);
2780
79640a4c
ED
2781 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2782 if (unlikely(contended)) {
2783 spin_unlock(&q->busylock);
2784 contended = false;
2785 }
bbd8a0d3 2786 __qdisc_run(q);
79640a4c 2787 } else
bc135b23 2788 qdisc_run_end(q);
bbd8a0d3
KK
2789
2790 rc = NET_XMIT_SUCCESS;
2791 } else {
7fee226a 2792 skb_dst_force(skb);
a2da570d 2793 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2794 if (qdisc_run_begin(q)) {
2795 if (unlikely(contended)) {
2796 spin_unlock(&q->busylock);
2797 contended = false;
2798 }
2799 __qdisc_run(q);
2800 }
bbd8a0d3
KK
2801 }
2802 spin_unlock(root_lock);
79640a4c
ED
2803 if (unlikely(contended))
2804 spin_unlock(&q->busylock);
bbd8a0d3
KK
2805 return rc;
2806}
2807
86f8515f 2808#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
5bc1421e
NH
2809static void skb_update_prio(struct sk_buff *skb)
2810{
6977a79d 2811 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2812
91c68ce2
ED
2813 if (!skb->priority && skb->sk && map) {
2814 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2815
2816 if (prioidx < map->priomap_len)
2817 skb->priority = map->priomap[prioidx];
2818 }
5bc1421e
NH
2819}
2820#else
2821#define skb_update_prio(skb)
2822#endif
2823
745e20f1 2824static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2825#define RECURSION_LIMIT 10
745e20f1 2826
95603e22
MM
2827/**
2828 * dev_loopback_xmit - loop back @skb
2829 * @skb: buffer to transmit
2830 */
2831int dev_loopback_xmit(struct sk_buff *skb)
2832{
2833 skb_reset_mac_header(skb);
2834 __skb_pull(skb, skb_network_offset(skb));
2835 skb->pkt_type = PACKET_LOOPBACK;
2836 skb->ip_summed = CHECKSUM_UNNECESSARY;
2837 WARN_ON(!skb_dst(skb));
2838 skb_dst_force(skb);
2839 netif_rx_ni(skb);
2840 return 0;
2841}
2842EXPORT_SYMBOL(dev_loopback_xmit);
2843
d29f749e 2844/**
9d08dd3d 2845 * __dev_queue_xmit - transmit a buffer
d29f749e 2846 * @skb: buffer to transmit
9d08dd3d 2847 * @accel_priv: private data used for L2 forwarding offload
d29f749e
DJ
2848 *
2849 * Queue a buffer for transmission to a network device. The caller must
2850 * have set the device and priority and built the buffer before calling
2851 * this function. The function can be called from an interrupt.
2852 *
2853 * A negative errno code is returned on a failure. A success does not
2854 * guarantee the frame will be transmitted as it may be dropped due
2855 * to congestion or traffic shaping.
2856 *
2857 * -----------------------------------------------------------------------------------
2858 * I notice this method can also return errors from the queue disciplines,
2859 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2860 * be positive.
2861 *
2862 * Regardless of the return value, the skb is consumed, so it is currently
2863 * difficult to retry a send to this method. (You can bump the ref count
2864 * before sending to hold a reference for retry if you are careful.)
2865 *
2866 * When calling this method, interrupts MUST be enabled. This is because
2867 * the BH enable code must have IRQs enabled so that it will not deadlock.
2868 * --BLG
2869 */
0a59f3a9 2870static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
1da177e4
LT
2871{
2872 struct net_device *dev = skb->dev;
dc2b4847 2873 struct netdev_queue *txq;
1da177e4
LT
2874 struct Qdisc *q;
2875 int rc = -ENOMEM;
2876
6d1ccff6
ED
2877 skb_reset_mac_header(skb);
2878
4ec93edb
YH
2879 /* Disable soft irqs for various locks below. Also
2880 * stops preemption for RCU.
1da177e4 2881 */
4ec93edb 2882 rcu_read_lock_bh();
1da177e4 2883
5bc1421e
NH
2884 skb_update_prio(skb);
2885
f663dd9a 2886 txq = netdev_pick_tx(dev, skb, accel_priv);
a898def2 2887 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2888
1da177e4 2889#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2890 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2891#endif
cf66ba58 2892 trace_net_dev_queue(skb);
1da177e4 2893 if (q->enqueue) {
bbd8a0d3 2894 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2895 goto out;
1da177e4
LT
2896 }
2897
2898 /* The device has no queue. Common case for software devices:
2899 loopback, all the sorts of tunnels...
2900
932ff279
HX
2901 Really, it is unlikely that netif_tx_lock protection is necessary
2902 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
2903 counters.)
2904 However, it is possible, that they rely on protection
2905 made by us here.
2906
 2907 Check this and shoot the lock. It is not prone to deadlocks.
 2908 Either shoot the noqueue qdisc, it is even simpler 8)
2909 */
2910 if (dev->flags & IFF_UP) {
2911 int cpu = smp_processor_id(); /* ok because BHs are off */
2912
c773e847 2913 if (txq->xmit_lock_owner != cpu) {
1da177e4 2914
745e20f1
ED
2915 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2916 goto recursion_alert;
2917
c773e847 2918 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2919
73466498 2920 if (!netif_xmit_stopped(txq)) {
745e20f1 2921 __this_cpu_inc(xmit_recursion);
f663dd9a 2922 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2923 __this_cpu_dec(xmit_recursion);
572a9d7b 2924 if (dev_xmit_complete(rc)) {
c773e847 2925 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2926 goto out;
2927 }
2928 }
c773e847 2929 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2930 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2931 dev->name);
1da177e4
LT
2932 } else {
2933 /* Recursion is detected! It is possible,
745e20f1
ED
2934 * unfortunately
2935 */
2936recursion_alert:
e87cc472
JP
2937 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2938 dev->name);
1da177e4
LT
2939 }
2940 }
2941
2942 rc = -ENETDOWN;
d4828d85 2943 rcu_read_unlock_bh();
1da177e4 2944
015f0688 2945 atomic_long_inc(&dev->tx_dropped);
1da177e4
LT
2946 kfree_skb(skb);
2947 return rc;
2948out:
d4828d85 2949 rcu_read_unlock_bh();
1da177e4
LT
2950 return rc;
2951}
f663dd9a
JW
2952
2953int dev_queue_xmit(struct sk_buff *skb)
2954{
2955 return __dev_queue_xmit(skb, NULL);
2956}
d1b19dff 2957EXPORT_SYMBOL(dev_queue_xmit);
1da177e4 2958
f663dd9a
JW
2959int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2960{
2961 return __dev_queue_xmit(skb, accel_priv);
2962}
2963EXPORT_SYMBOL(dev_queue_xmit_accel);
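/*
 * Illustrative sketch, not part of this file: injecting a pre-built
 * Ethernet frame through dev_queue_xmit().  frame/len are assumed to hold
 * a complete frame including the Ethernet header, and proto its ethertype
 * in network byte order.
 */
static int example_send_frame(struct net_device *dev, const void *frame,
                              unsigned int len, __be16 proto)
{
        struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);

        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memcpy(skb_put(skb, len), frame, len);
        skb_reset_mac_header(skb);
        skb->dev = dev;
        skb->protocol = proto;

        /* may also return positive NET_XMIT_* codes, as noted above */
        return dev_queue_xmit(skb);
}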
2964
1da177e4
LT
2965
2966/*=======================================================================
2967 Receiver routines
2968 =======================================================================*/
2969
6b2bedc3 2970int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
2971EXPORT_SYMBOL(netdev_max_backlog);
2972
3b098e2d 2973int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2974int netdev_budget __read_mostly = 300;
2975int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2976
eecfd7c4
ED
2977/* Called with irq disabled */
2978static inline void ____napi_schedule(struct softnet_data *sd,
2979 struct napi_struct *napi)
2980{
2981 list_add_tail(&napi->poll_list, &sd->poll_list);
2982 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2983}
2984
bfb564e7
KK
2985#ifdef CONFIG_RPS
2986
2987/* One global table that all flow-based protocols share. */
6e3f7faf 2988struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2989EXPORT_SYMBOL(rps_sock_flow_table);
2990
c5905afb 2991struct static_key rps_needed __read_mostly;
adc9300e 2992
c445477d
BH
2993static struct rps_dev_flow *
2994set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2995 struct rps_dev_flow *rflow, u16 next_cpu)
2996{
09994d1b 2997 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2998#ifdef CONFIG_RFS_ACCEL
2999 struct netdev_rx_queue *rxqueue;
3000 struct rps_dev_flow_table *flow_table;
3001 struct rps_dev_flow *old_rflow;
3002 u32 flow_id;
3003 u16 rxq_index;
3004 int rc;
3005
3006 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
3007 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3008 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
3009 goto out;
3010 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3011 if (rxq_index == skb_get_rx_queue(skb))
3012 goto out;
3013
3014 rxqueue = dev->_rx + rxq_index;
3015 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3016 if (!flow_table)
3017 goto out;
61b905da 3018 flow_id = skb_get_hash(skb) & flow_table->mask;
c445477d
BH
3019 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3020 rxq_index, flow_id);
3021 if (rc < 0)
3022 goto out;
3023 old_rflow = rflow;
3024 rflow = &flow_table->flows[flow_id];
c445477d
BH
3025 rflow->filter = rc;
3026 if (old_rflow->filter == rflow->filter)
3027 old_rflow->filter = RPS_NO_FILTER;
3028 out:
3029#endif
3030 rflow->last_qtail =
09994d1b 3031 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
3032 }
3033
09994d1b 3034 rflow->cpu = next_cpu;
c445477d
BH
3035 return rflow;
3036}
3037
bfb564e7
KK
3038/*
3039 * get_rps_cpu is called from netif_receive_skb and returns the target
3040 * CPU from the RPS map of the receiving queue for a given skb.
3041 * rcu_read_lock must be held on entry.
3042 */
3043static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3044 struct rps_dev_flow **rflowp)
3045{
3046 struct netdev_rx_queue *rxqueue;
6e3f7faf 3047 struct rps_map *map;
bfb564e7
KK
3048 struct rps_dev_flow_table *flow_table;
3049 struct rps_sock_flow_table *sock_flow_table;
3050 int cpu = -1;
3051 u16 tcpu;
61b905da 3052 u32 hash;
bfb564e7
KK
3053
3054 if (skb_rx_queue_recorded(skb)) {
3055 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
3056 if (unlikely(index >= dev->real_num_rx_queues)) {
3057 WARN_ONCE(dev->real_num_rx_queues > 1,
3058 "%s received packet on queue %u, but number "
3059 "of RX queues is %u\n",
3060 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
3061 goto done;
3062 }
3063 rxqueue = dev->_rx + index;
3064 } else
3065 rxqueue = dev->_rx;
3066
6e3f7faf
ED
3067 map = rcu_dereference(rxqueue->rps_map);
3068 if (map) {
85875236 3069 if (map->len == 1 &&
33d480ce 3070 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
3071 tcpu = map->cpus[0];
3072 if (cpu_online(tcpu))
3073 cpu = tcpu;
3074 goto done;
3075 }
33d480ce 3076 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 3077 goto done;
6febfca9 3078 }
bfb564e7 3079
2d47b459 3080 skb_reset_network_header(skb);
61b905da
TH
3081 hash = skb_get_hash(skb);
3082 if (!hash)
bfb564e7
KK
3083 goto done;
3084
fec5e652
TH
3085 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3086 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3087 if (flow_table && sock_flow_table) {
3088 u16 next_cpu;
3089 struct rps_dev_flow *rflow;
3090
61b905da 3091 rflow = &flow_table->flows[hash & flow_table->mask];
fec5e652
TH
3092 tcpu = rflow->cpu;
3093
61b905da 3094 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
fec5e652
TH
3095
3096 /*
3097 * If the desired CPU (where last recvmsg was done) is
3098 * different from current CPU (one in the rx-queue flow
3099 * table entry), switch if one of the following holds:
3100 * - Current CPU is unset (equal to RPS_NO_CPU).
3101 * - Current CPU is offline.
3102 * - The current CPU's queue tail has advanced beyond the
3103 * last packet that was enqueued using this table entry.
3104 * This guarantees that all previous packets for the flow
 3105 * have been dequeued, thus preserving in-order delivery.
3106 */
3107 if (unlikely(tcpu != next_cpu) &&
3108 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3109 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3110 rflow->last_qtail)) >= 0)) {
3111 tcpu = next_cpu;
c445477d 3112 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3113 }
c445477d 3114
fec5e652
TH
3115 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3116 *rflowp = rflow;
3117 cpu = tcpu;
3118 goto done;
3119 }
3120 }
3121
0a9627f2 3122 if (map) {
61b905da 3123 tcpu = map->cpus[((u64) hash * map->len) >> 32];
0a9627f2
TH
3124
3125 if (cpu_online(tcpu)) {
3126 cpu = tcpu;
3127 goto done;
3128 }
3129 }
3130
3131done:
0a9627f2
TH
3132 return cpu;
3133}
3134
c445477d
BH
3135#ifdef CONFIG_RFS_ACCEL
3136
3137/**
3138 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3139 * @dev: Device on which the filter was set
3140 * @rxq_index: RX queue index
3141 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3142 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3143 *
3144 * Drivers that implement ndo_rx_flow_steer() should periodically call
3145 * this function for each installed filter and remove the filters for
3146 * which it returns %true.
3147 */
3148bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3149 u32 flow_id, u16 filter_id)
3150{
3151 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3152 struct rps_dev_flow_table *flow_table;
3153 struct rps_dev_flow *rflow;
3154 bool expire = true;
3155 int cpu;
3156
3157 rcu_read_lock();
3158 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3159 if (flow_table && flow_id <= flow_table->mask) {
3160 rflow = &flow_table->flows[flow_id];
3161 cpu = ACCESS_ONCE(rflow->cpu);
3162 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3163 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3164 rflow->last_qtail) <
3165 (int)(10 * flow_table->mask)))
3166 expire = false;
3167 }
3168 rcu_read_unlock();
3169 return expire;
3170}
3171EXPORT_SYMBOL(rps_may_expire_flow);
3172
3173#endif /* CONFIG_RFS_ACCEL */
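/*
 * Illustrative sketch, not part of this file: a driver's periodic scan that
 * asks rps_may_expire_flow() whether each installed ARFS filter is still
 * wanted and removes the ones that are not.  The private structures and
 * example_hw_remove_filter() are assumptions.
 */
struct example_filter {
        struct list_head node;
        u16 rxq_index;
        u32 flow_id;
        u16 filter_id;
};

struct example_priv {
        struct net_device *netdev;
        spinlock_t filter_lock;
        struct list_head filter_list;
};

static void example_expire_arfs_filters(struct example_priv *priv)
{
        struct example_filter *f, *tmp;

        spin_lock_bh(&priv->filter_lock);
        list_for_each_entry_safe(f, tmp, &priv->filter_list, node) {
                if (rps_may_expire_flow(priv->netdev, f->rxq_index,
                                        f->flow_id, f->filter_id)) {
                        example_hw_remove_filter(priv, f);      /* hypothetical */
                        list_del(&f->node);
                        kfree(f);
                }
        }
        spin_unlock_bh(&priv->filter_lock);
}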
3174
0a9627f2 3175/* Called from hardirq (IPI) context */
e36fa2f7 3176static void rps_trigger_softirq(void *data)
0a9627f2 3177{
e36fa2f7
ED
3178 struct softnet_data *sd = data;
3179
eecfd7c4 3180 ____napi_schedule(sd, &sd->backlog);
dee42870 3181 sd->received_rps++;
0a9627f2 3182}
e36fa2f7 3183
fec5e652 3184#endif /* CONFIG_RPS */
0a9627f2 3185
e36fa2f7
ED
3186/*
3187 * Check if this softnet_data structure is another cpu one
3188 * If yes, queue it to our IPI list and return 1
3189 * If no, return 0
3190 */
3191static int rps_ipi_queued(struct softnet_data *sd)
3192{
3193#ifdef CONFIG_RPS
3194 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3195
3196 if (sd != mysd) {
3197 sd->rps_ipi_next = mysd->rps_ipi_list;
3198 mysd->rps_ipi_list = sd;
3199
3200 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3201 return 1;
3202 }
3203#endif /* CONFIG_RPS */
3204 return 0;
3205}
3206
99bbc707
WB
3207#ifdef CONFIG_NET_FLOW_LIMIT
3208int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3209#endif
3210
3211static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3212{
3213#ifdef CONFIG_NET_FLOW_LIMIT
3214 struct sd_flow_limit *fl;
3215 struct softnet_data *sd;
3216 unsigned int old_flow, new_flow;
3217
3218 if (qlen < (netdev_max_backlog >> 1))
3219 return false;
3220
3221 sd = &__get_cpu_var(softnet_data);
3222
3223 rcu_read_lock();
3224 fl = rcu_dereference(sd->flow_limit);
3225 if (fl) {
3958afa1 3226 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
99bbc707
WB
3227 old_flow = fl->history[fl->history_head];
3228 fl->history[fl->history_head] = new_flow;
3229
3230 fl->history_head++;
3231 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3232
3233 if (likely(fl->buckets[old_flow]))
3234 fl->buckets[old_flow]--;
3235
3236 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3237 fl->count++;
3238 rcu_read_unlock();
3239 return true;
3240 }
3241 }
3242 rcu_read_unlock();
3243#endif
3244 return false;
3245}
3246
0a9627f2
TH
3247/*
3248 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3249 * queue (may be a remote CPU queue).
3250 */
fec5e652
TH
3251static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3252 unsigned int *qtail)
0a9627f2 3253{
e36fa2f7 3254 struct softnet_data *sd;
0a9627f2 3255 unsigned long flags;
99bbc707 3256 unsigned int qlen;
0a9627f2 3257
e36fa2f7 3258 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3259
3260 local_irq_save(flags);
0a9627f2 3261
e36fa2f7 3262 rps_lock(sd);
99bbc707
WB
3263 qlen = skb_queue_len(&sd->input_pkt_queue);
3264 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
6e7676c1 3265 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 3266enqueue:
e36fa2f7 3267 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3268 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3269 rps_unlock(sd);
152102c7 3270 local_irq_restore(flags);
0a9627f2
TH
3271 return NET_RX_SUCCESS;
3272 }
3273
ebda37c2
ED
3274 /* Schedule NAPI for backlog device
 3275 * We can use a non-atomic operation since we own the queue lock
3276 */
3277 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3278 if (!rps_ipi_queued(sd))
eecfd7c4 3279 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3280 }
3281 goto enqueue;
3282 }
3283
dee42870 3284 sd->dropped++;
e36fa2f7 3285 rps_unlock(sd);
0a9627f2 3286
0a9627f2
TH
3287 local_irq_restore(flags);
3288
caf586e5 3289 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3290 kfree_skb(skb);
3291 return NET_RX_DROP;
3292}
1da177e4 3293
ae78dbfa 3294static int netif_rx_internal(struct sk_buff *skb)
1da177e4 3295{
b0e28f1e 3296 int ret;
1da177e4 3297
588f0330 3298 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3299
cf66ba58 3300 trace_netif_rx(skb);
df334545 3301#ifdef CONFIG_RPS
c5905afb 3302 if (static_key_false(&rps_needed)) {
fec5e652 3303 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3304 int cpu;
3305
cece1945 3306 preempt_disable();
b0e28f1e 3307 rcu_read_lock();
fec5e652
TH
3308
3309 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3310 if (cpu < 0)
3311 cpu = smp_processor_id();
fec5e652
TH
3312
3313 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3314
b0e28f1e 3315 rcu_read_unlock();
cece1945 3316 preempt_enable();
adc9300e
ED
3317 } else
3318#endif
fec5e652
TH
3319 {
3320 unsigned int qtail;
3321 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3322 put_cpu();
3323 }
b0e28f1e 3324 return ret;
1da177e4 3325}
ae78dbfa
BH
3326
3327/**
3328 * netif_rx - post buffer to the network code
3329 * @skb: buffer to post
3330 *
3331 * This function receives a packet from a device driver and queues it for
3332 * the upper (protocol) levels to process. It always succeeds. The buffer
3333 * may be dropped during processing for congestion control or by the
3334 * protocol layers.
3335 *
3336 * return values:
3337 * NET_RX_SUCCESS (no congestion)
3338 * NET_RX_DROP (packet was dropped)
3339 *
3340 */
3341
3342int netif_rx(struct sk_buff *skb)
3343{
3344 trace_netif_rx_entry(skb);
3345
3346 return netif_rx_internal(skb);
3347}
d1b19dff 3348EXPORT_SYMBOL(netif_rx);
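/*
 * Illustrative sketch, not part of this file: a non-NAPI driver's receive
 * path handing a frame of len bytes at buf to the stack with netif_rx().
 */
static void example_rx(struct net_device *dev, const void *buf, unsigned int len)
{
        struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        memcpy(skb_put(skb, len), buf, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */
        netif_rx(skb);
}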
1da177e4
LT
3349
3350int netif_rx_ni(struct sk_buff *skb)
3351{
3352 int err;
3353
ae78dbfa
BH
3354 trace_netif_rx_ni_entry(skb);
3355
1da177e4 3356 preempt_disable();
ae78dbfa 3357 err = netif_rx_internal(skb);
1da177e4
LT
3358 if (local_softirq_pending())
3359 do_softirq();
3360 preempt_enable();
3361
3362 return err;
3363}
1da177e4
LT
3364EXPORT_SYMBOL(netif_rx_ni);
3365
1da177e4
LT
3366static void net_tx_action(struct softirq_action *h)
3367{
3368 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3369
3370 if (sd->completion_queue) {
3371 struct sk_buff *clist;
3372
3373 local_irq_disable();
3374 clist = sd->completion_queue;
3375 sd->completion_queue = NULL;
3376 local_irq_enable();
3377
3378 while (clist) {
3379 struct sk_buff *skb = clist;
3380 clist = clist->next;
3381
547b792c 3382 WARN_ON(atomic_read(&skb->users));
e6247027
ED
3383 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3384 trace_consume_skb(skb);
3385 else
3386 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3387 __kfree_skb(skb);
3388 }
3389 }
3390
3391 if (sd->output_queue) {
37437bb2 3392 struct Qdisc *head;
1da177e4
LT
3393
3394 local_irq_disable();
3395 head = sd->output_queue;
3396 sd->output_queue = NULL;
a9cbd588 3397 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3398 local_irq_enable();
3399
3400 while (head) {
37437bb2
DM
3401 struct Qdisc *q = head;
3402 spinlock_t *root_lock;
3403
1da177e4
LT
3404 head = head->next_sched;
3405
5fb66229 3406 root_lock = qdisc_lock(q);
37437bb2 3407 if (spin_trylock(root_lock)) {
4e857c58 3408 smp_mb__before_atomic();
def82a1d
JP
3409 clear_bit(__QDISC_STATE_SCHED,
3410 &q->state);
37437bb2
DM
3411 qdisc_run(q);
3412 spin_unlock(root_lock);
1da177e4 3413 } else {
195648bb 3414 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3415 &q->state)) {
195648bb 3416 __netif_reschedule(q);
e8a83e10 3417 } else {
4e857c58 3418 smp_mb__before_atomic();
e8a83e10
JP
3419 clear_bit(__QDISC_STATE_SCHED,
3420 &q->state);
3421 }
1da177e4
LT
3422 }
3423 }
3424 }
3425}
3426
ab95bfe0
JP
3427#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3428 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3429/* This hook is defined here for ATM LANE */
3430int (*br_fdb_test_addr_hook)(struct net_device *dev,
3431 unsigned char *addr) __read_mostly;
4fb019a0 3432EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3433#endif
1da177e4 3434
1da177e4
LT
3435#ifdef CONFIG_NET_CLS_ACT
 3436/* TODO: Maybe we should just force sch_ingress to be compiled in
 3437 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 3438 * instructions (a compare and 2 stores extra) right now if we don't
 3439 * have it on but do have CONFIG_NET_CLS_ACT.
25985edc
LDM
 3440 * NOTE: This doesn't stop any functionality; if you don't have
3441 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3442 *
3443 */
24824a09 3444static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3445{
1da177e4 3446 struct net_device *dev = skb->dev;
f697c3e8 3447 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3448 int result = TC_ACT_OK;
3449 struct Qdisc *q;
4ec93edb 3450
de384830 3451 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
3452 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3453 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3454 return TC_ACT_SHOT;
3455 }
1da177e4 3456
f697c3e8
HX
3457 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3458 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3459
83874000 3460 q = rxq->qdisc;
8d50b53d 3461 if (q != &noop_qdisc) {
83874000 3462 spin_lock(qdisc_lock(q));
a9312ae8
DM
3463 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3464 result = qdisc_enqueue_root(skb, q);
83874000
DM
3465 spin_unlock(qdisc_lock(q));
3466 }
f697c3e8
HX
3467
3468 return result;
3469}
86e65da9 3470
f697c3e8
HX
3471static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3472 struct packet_type **pt_prev,
3473 int *ret, struct net_device *orig_dev)
3474{
24824a09
ED
3475 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3476
3477 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3478 goto out;
1da177e4 3479
f697c3e8
HX
3480 if (*pt_prev) {
3481 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3482 *pt_prev = NULL;
1da177e4
LT
3483 }
3484
24824a09 3485 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3486 case TC_ACT_SHOT:
3487 case TC_ACT_STOLEN:
3488 kfree_skb(skb);
3489 return NULL;
3490 }
3491
3492out:
3493 skb->tc_verd = 0;
3494 return skb;
1da177e4
LT
3495}
3496#endif
3497
ab95bfe0
JP
3498/**
3499 * netdev_rx_handler_register - register receive handler
3500 * @dev: device to register a handler for
3501 * @rx_handler: receive handler to register
93e2c32b 3502 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0 3503 *
e227867f 3504 * Register a receive handler for a device. This handler will then be
ab95bfe0
JP
3505 * called from __netif_receive_skb. A negative errno code is returned
3506 * on a failure.
3507 *
3508 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3509 *
3510 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3511 */
3512int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3513 rx_handler_func_t *rx_handler,
3514 void *rx_handler_data)
ab95bfe0
JP
3515{
3516 ASSERT_RTNL();
3517
3518 if (dev->rx_handler)
3519 return -EBUSY;
3520
00cfec37 3521 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3522 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3523 rcu_assign_pointer(dev->rx_handler, rx_handler);
3524
3525 return 0;
3526}
3527EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
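/* Illustrative sketch, hedged and not part of dev.c: roughly how a
 * hypothetical module could use the rx_handler API documented above.
 * foo_handle_frame() and foo_port_attach() are invented names; only
 * netdev_rx_handler_register() and the RX_HANDLER_* return codes are the
 * real API described by enum rx_handler_result.
 */
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	void *port = rcu_dereference(skb->dev->rx_handler_data);

	/* inspect skb and/or the per-port state registered below ... */
	(void)port;

	return RX_HANDLER_PASS;		/* let the stack keep processing skb */
}

static int foo_port_attach(struct net_device *dev, void *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, foo_handle_frame, port);
	rtnl_unlock();

	return err;
}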
3528
3529/**
3530 * netdev_rx_handler_unregister - unregister receive handler
3531 * @dev: device to unregister a handler from
3532 *
166ec369 3533 * Unregister a receive handler from a device.
ab95bfe0
JP
3534 *
3535 * The caller must hold the rtnl_mutex.
3536 */
3537void netdev_rx_handler_unregister(struct net_device *dev)
3538{
3539
3540 ASSERT_RTNL();
a9b3cd7f 3541 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
3542 /* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3543 * section is guaranteed to see a non-NULL rx_handler_data
3544 * as well.
3545 */
3546 synchronize_net();
a9b3cd7f 3547 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3548}
3549EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3550
b4b9e355
MG
3551/*
3552 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3553 * the special handling of PFMEMALLOC skbs.
3554 */
3555static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3556{
3557 switch (skb->protocol) {
2b8837ae
JP
3558 case htons(ETH_P_ARP):
3559 case htons(ETH_P_IP):
3560 case htons(ETH_P_IPV6):
3561 case htons(ETH_P_8021Q):
3562 case htons(ETH_P_8021AD):
b4b9e355
MG
3563 return true;
3564 default:
3565 return false;
3566 }
3567}
3568
9754e293 3569static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3570{
3571 struct packet_type *ptype, *pt_prev;
ab95bfe0 3572 rx_handler_func_t *rx_handler;
f2ccd8fa 3573 struct net_device *orig_dev;
63d8ea7f 3574 struct net_device *null_or_dev;
8a4eb573 3575 bool deliver_exact = false;
1da177e4 3576 int ret = NET_RX_DROP;
252e3346 3577 __be16 type;
1da177e4 3578
588f0330 3579 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3580
cf66ba58 3581 trace_netif_receive_skb(skb);
9b22ea56 3582
cc9bd5ce 3583 orig_dev = skb->dev;
8f903c70 3584
c1d2bbe1 3585 skb_reset_network_header(skb);
fda55eca
ED
3586 if (!skb_transport_header_was_set(skb))
3587 skb_reset_transport_header(skb);
0b5c9db1 3588 skb_reset_mac_len(skb);
1da177e4
LT
3589
3590 pt_prev = NULL;
3591
3592 rcu_read_lock();
3593
63d8ea7f 3594another_round:
b6858177 3595 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3596
3597 __this_cpu_inc(softnet_data.processed);
3598
8ad227ff
PM
3599 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3600 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
bcc6d479
JP
3601 skb = vlan_untag(skb);
3602 if (unlikely(!skb))
b4b9e355 3603 goto unlock;
bcc6d479
JP
3604 }
3605
1da177e4
LT
3606#ifdef CONFIG_NET_CLS_ACT
3607 if (skb->tc_verd & TC_NCLS) {
3608 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3609 goto ncls;
3610 }
3611#endif
3612
9754e293 3613 if (pfmemalloc)
b4b9e355
MG
3614 goto skip_taps;
3615
1da177e4 3616 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3617 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3618 if (pt_prev)
f2ccd8fa 3619 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3620 pt_prev = ptype;
3621 }
3622 }
3623
b4b9e355 3624skip_taps:
1da177e4 3625#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3626 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3627 if (!skb)
b4b9e355 3628 goto unlock;
1da177e4
LT
3629ncls:
3630#endif
3631
9754e293 3632 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3633 goto drop;
3634
2425717b
JF
3635 if (vlan_tx_tag_present(skb)) {
3636 if (pt_prev) {
3637 ret = deliver_skb(skb, pt_prev, orig_dev);
3638 pt_prev = NULL;
3639 }
48cc32d3 3640 if (vlan_do_receive(&skb))
2425717b
JF
3641 goto another_round;
3642 else if (unlikely(!skb))
b4b9e355 3643 goto unlock;
2425717b
JF
3644 }
3645
48cc32d3 3646 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3647 if (rx_handler) {
3648 if (pt_prev) {
3649 ret = deliver_skb(skb, pt_prev, orig_dev);
3650 pt_prev = NULL;
3651 }
8a4eb573
JP
3652 switch (rx_handler(&skb)) {
3653 case RX_HANDLER_CONSUMED:
3bc1b1ad 3654 ret = NET_RX_SUCCESS;
b4b9e355 3655 goto unlock;
8a4eb573 3656 case RX_HANDLER_ANOTHER:
63d8ea7f 3657 goto another_round;
8a4eb573
JP
3658 case RX_HANDLER_EXACT:
3659 deliver_exact = true;
3660 case RX_HANDLER_PASS:
3661 break;
3662 default:
3663 BUG();
3664 }
ab95bfe0 3665 }
1da177e4 3666
d4b812de
ED
3667 if (unlikely(vlan_tx_tag_present(skb))) {
3668 if (vlan_tx_tag_get_id(skb))
3669 skb->pkt_type = PACKET_OTHERHOST;
3670 /* Note: we might in the future use prio bits
3671 * and set skb->priority like in vlan_do_receive()
3672 * For the time being, just ignore Priority Code Point
3673 */
3674 skb->vlan_tci = 0;
3675 }
48cc32d3 3676
63d8ea7f 3677 /* deliver only exact match when indicated */
8a4eb573 3678 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3679
1da177e4 3680 type = skb->protocol;
82d8a867
PE
3681 list_for_each_entry_rcu(ptype,
3682 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3683 if (ptype->type == type &&
e3f48d37
JP
3684 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3685 ptype->dev == orig_dev)) {
4ec93edb 3686 if (pt_prev)
f2ccd8fa 3687 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3688 pt_prev = ptype;
3689 }
3690 }
3691
3692 if (pt_prev) {
1080e512 3693 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3694 goto drop;
1080e512
MT
3695 else
3696 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3697 } else {
b4b9e355 3698drop:
caf586e5 3699 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3700 kfree_skb(skb);
3701 /* Jamal, now you will not be able to escape explaining
3702 * to me how you were going to use this. :-)
3703 */
3704 ret = NET_RX_DROP;
3705 }
3706
b4b9e355 3707unlock:
1da177e4 3708 rcu_read_unlock();
9754e293
DM
3709 return ret;
3710}
3711
3712static int __netif_receive_skb(struct sk_buff *skb)
3713{
3714 int ret;
3715
3716 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3717 unsigned long pflags = current->flags;
3718
3719 /*
3720 * PFMEMALLOC skbs are special, they should
3721 * - be delivered to SOCK_MEMALLOC sockets only
3722 * - stay away from userspace
3723 * - have bounded memory usage
3724 *
3725 * Use PF_MEMALLOC as this saves us from propagating the allocation
3726 * context down to all allocation sites.
3727 */
3728 current->flags |= PF_MEMALLOC;
3729 ret = __netif_receive_skb_core(skb, true);
3730 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3731 } else
3732 ret = __netif_receive_skb_core(skb, false);
3733
1da177e4
LT
3734 return ret;
3735}
0a9627f2 3736
ae78dbfa 3737static int netif_receive_skb_internal(struct sk_buff *skb)
0a9627f2 3738{
588f0330 3739 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3740
c1f19b51
RC
3741 if (skb_defer_rx_timestamp(skb))
3742 return NET_RX_SUCCESS;
3743
df334545 3744#ifdef CONFIG_RPS
c5905afb 3745 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3746 struct rps_dev_flow voidflow, *rflow = &voidflow;
3747 int cpu, ret;
fec5e652 3748
3b098e2d
ED
3749 rcu_read_lock();
3750
3751 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3752
3b098e2d
ED
3753 if (cpu >= 0) {
3754 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3755 rcu_read_unlock();
adc9300e 3756 return ret;
3b098e2d 3757 }
adc9300e 3758 rcu_read_unlock();
fec5e652 3759 }
1e94d72f 3760#endif
adc9300e 3761 return __netif_receive_skb(skb);
0a9627f2 3762}
ae78dbfa
BH
3763
3764/**
3765 * netif_receive_skb - process receive buffer from network
3766 * @skb: buffer to process
3767 *
3768 * netif_receive_skb() is the main receive data processing function.
3769 * It always succeeds. The buffer may be dropped during processing
3770 * for congestion control or by the protocol layers.
3771 *
3772 * This function may only be called from softirq context and interrupts
3773 * should be enabled.
3774 *
3775 * Return values (usually ignored):
3776 * NET_RX_SUCCESS: no congestion
3777 * NET_RX_DROP: packet was dropped
3778 */
3779int netif_receive_skb(struct sk_buff *skb)
3780{
3781 trace_netif_receive_skb_entry(skb);
3782
3783 return netif_receive_skb_internal(skb);
3784}
d1b19dff 3785EXPORT_SYMBOL(netif_receive_skb);
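/* Illustrative sketch, not part of dev.c: a hypothetical driver's RX
 * completion handler, running from its NAPI ->poll() (softirq context, as
 * required above), copying one received frame into an skb and handing it
 * to netif_receive_skb(). foo_rx_one() and its arguments are invented.
 */
static void foo_rx_one(struct napi_struct *napi, const void *data,
		       unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(napi->dev, len);

	if (unlikely(!skb))
		return;				/* rx_dropped accounting elided */

	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);			/* return value usually ignored */
}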
1da177e4 3786
88751275
ED
3787/* Network device is going away, flush any packets still pending
3788 * Called with irqs disabled.
3789 */
152102c7 3790static void flush_backlog(void *arg)
6e583ce5 3791{
152102c7 3792 struct net_device *dev = arg;
e36fa2f7 3793 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3794 struct sk_buff *skb, *tmp;
3795
e36fa2f7 3796 rps_lock(sd);
6e7676c1 3797 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3798 if (skb->dev == dev) {
e36fa2f7 3799 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3800 kfree_skb(skb);
76cc8b13 3801 input_queue_head_incr(sd);
6e583ce5 3802 }
6e7676c1 3803 }
e36fa2f7 3804 rps_unlock(sd);
6e7676c1
CG
3805
3806 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3807 if (skb->dev == dev) {
3808 __skb_unlink(skb, &sd->process_queue);
3809 kfree_skb(skb);
76cc8b13 3810 input_queue_head_incr(sd);
6e7676c1
CG
3811 }
3812 }
6e583ce5
SH
3813}
3814
d565b0a1
HX
3815static int napi_gro_complete(struct sk_buff *skb)
3816{
22061d80 3817 struct packet_offload *ptype;
d565b0a1 3818 __be16 type = skb->protocol;
22061d80 3819 struct list_head *head = &offload_base;
d565b0a1
HX
3820 int err = -ENOENT;
3821
c3c7c254
ED
3822 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3823
fc59f9a3
HX
3824 if (NAPI_GRO_CB(skb)->count == 1) {
3825 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3826 goto out;
fc59f9a3 3827 }
d565b0a1
HX
3828
3829 rcu_read_lock();
3830 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3831 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
3832 continue;
3833
299603e8 3834 err = ptype->callbacks.gro_complete(skb, 0);
d565b0a1
HX
3835 break;
3836 }
3837 rcu_read_unlock();
3838
3839 if (err) {
3840 WARN_ON(&ptype->list == head);
3841 kfree_skb(skb);
3842 return NET_RX_SUCCESS;
3843 }
3844
3845out:
ae78dbfa 3846 return netif_receive_skb_internal(skb);
d565b0a1
HX
3847}
3848
2e71a6f8
ED
3849/* napi->gro_list contains packets ordered by age.
3850 * The youngest packets are at its head.
3851 * Complete skbs in reverse order to reduce latencies.
3852 */
3853void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 3854{
2e71a6f8 3855 struct sk_buff *skb, *prev = NULL;
d565b0a1 3856
2e71a6f8
ED
3857 /* scan list and build reverse chain */
3858 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3859 skb->prev = prev;
3860 prev = skb;
3861 }
3862
3863 for (skb = prev; skb; skb = prev) {
d565b0a1 3864 skb->next = NULL;
2e71a6f8
ED
3865
3866 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3867 return;
3868
3869 prev = skb->prev;
d565b0a1 3870 napi_gro_complete(skb);
2e71a6f8 3871 napi->gro_count--;
d565b0a1
HX
3872 }
3873
3874 napi->gro_list = NULL;
3875}
86cac58b 3876EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3877
89c5fa33
ED
3878static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3879{
3880 struct sk_buff *p;
3881 unsigned int maclen = skb->dev->hard_header_len;
0b4cec8c 3882 u32 hash = skb_get_hash_raw(skb);
89c5fa33
ED
3883
3884 for (p = napi->gro_list; p; p = p->next) {
3885 unsigned long diffs;
3886
0b4cec8c
TH
3887 NAPI_GRO_CB(p)->flush = 0;
3888
3889 if (hash != skb_get_hash_raw(p)) {
3890 NAPI_GRO_CB(p)->same_flow = 0;
3891 continue;
3892 }
3893
89c5fa33
ED
3894 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3895 diffs |= p->vlan_tci ^ skb->vlan_tci;
3896 if (maclen == ETH_HLEN)
3897 diffs |= compare_ether_header(skb_mac_header(p),
a50e233c 3898 skb_mac_header(skb));
89c5fa33
ED
3899 else if (!diffs)
3900 diffs = memcmp(skb_mac_header(p),
a50e233c 3901 skb_mac_header(skb),
89c5fa33
ED
3902 maclen);
3903 NAPI_GRO_CB(p)->same_flow = !diffs;
89c5fa33
ED
3904 }
3905}
3906
299603e8
JC
3907static void skb_gro_reset_offset(struct sk_buff *skb)
3908{
3909 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3910 const skb_frag_t *frag0 = &pinfo->frags[0];
3911
3912 NAPI_GRO_CB(skb)->data_offset = 0;
3913 NAPI_GRO_CB(skb)->frag0 = NULL;
3914 NAPI_GRO_CB(skb)->frag0_len = 0;
3915
3916 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3917 pinfo->nr_frags &&
3918 !PageHighMem(skb_frag_page(frag0))) {
3919 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3920 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
89c5fa33
ED
3921 }
3922}
3923
a50e233c
ED
3924static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3925{
3926 struct skb_shared_info *pinfo = skb_shinfo(skb);
3927
3928 BUG_ON(skb->end - skb->tail < grow);
3929
3930 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3931
3932 skb->data_len -= grow;
3933 skb->tail += grow;
3934
3935 pinfo->frags[0].page_offset += grow;
3936 skb_frag_size_sub(&pinfo->frags[0], grow);
3937
3938 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3939 skb_frag_unref(skb, 0);
3940 memmove(pinfo->frags, pinfo->frags + 1,
3941 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3942 }
3943}
3944
bb728820 3945static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3946{
3947 struct sk_buff **pp = NULL;
22061d80 3948 struct packet_offload *ptype;
d565b0a1 3949 __be16 type = skb->protocol;
22061d80 3950 struct list_head *head = &offload_base;
0da2afd5 3951 int same_flow;
5b252f0c 3952 enum gro_result ret;
a50e233c 3953 int grow;
d565b0a1 3954
9c62a68d 3955 if (!(skb->dev->features & NETIF_F_GRO))
d565b0a1
HX
3956 goto normal;
3957
21dc3301 3958 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3959 goto normal;
3960
89c5fa33 3961 gro_list_prepare(napi, skb);
bf5a755f 3962 NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
89c5fa33 3963
d565b0a1
HX
3964 rcu_read_lock();
3965 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3966 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
3967 continue;
3968
86911732 3969 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 3970 skb_reset_mac_len(skb);
d565b0a1
HX
3971 NAPI_GRO_CB(skb)->same_flow = 0;
3972 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3973 NAPI_GRO_CB(skb)->free = 0;
b582ef09 3974 NAPI_GRO_CB(skb)->udp_mark = 0;
d565b0a1 3975
f191a1d1 3976 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
3977 break;
3978 }
3979 rcu_read_unlock();
3980
3981 if (&ptype->list == head)
3982 goto normal;
3983
0da2afd5 3984 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3985 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3986
d565b0a1
HX
3987 if (pp) {
3988 struct sk_buff *nskb = *pp;
3989
3990 *pp = nskb->next;
3991 nskb->next = NULL;
3992 napi_gro_complete(nskb);
4ae5544f 3993 napi->gro_count--;
d565b0a1
HX
3994 }
3995
0da2afd5 3996 if (same_flow)
d565b0a1
HX
3997 goto ok;
3998
600adc18 3999 if (NAPI_GRO_CB(skb)->flush)
d565b0a1 4000 goto normal;
d565b0a1 4001
600adc18
ED
4002 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4003 struct sk_buff *nskb = napi->gro_list;
4004
4005 /* locate the end of the list to select the 'oldest' flow */
4006 while (nskb->next) {
4007 pp = &nskb->next;
4008 nskb = *pp;
4009 }
4010 *pp = NULL;
4011 nskb->next = NULL;
4012 napi_gro_complete(nskb);
4013 } else {
4014 napi->gro_count++;
4015 }
d565b0a1 4016 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 4017 NAPI_GRO_CB(skb)->age = jiffies;
29e98242 4018 NAPI_GRO_CB(skb)->last = skb;
86911732 4019 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
4020 skb->next = napi->gro_list;
4021 napi->gro_list = skb;
5d0d9be8 4022 ret = GRO_HELD;
d565b0a1 4023
ad0f9904 4024pull:
a50e233c
ED
4025 grow = skb_gro_offset(skb) - skb_headlen(skb);
4026 if (grow > 0)
4027 gro_pull_from_frag0(skb, grow);
d565b0a1 4028ok:
5d0d9be8 4029 return ret;
d565b0a1
HX
4030
4031normal:
ad0f9904
HX
4032 ret = GRO_NORMAL;
4033 goto pull;
5d38a079 4034}
96e93eab 4035
bf5a755f
JC
4036struct packet_offload *gro_find_receive_by_type(__be16 type)
4037{
4038 struct list_head *offload_head = &offload_base;
4039 struct packet_offload *ptype;
4040
4041 list_for_each_entry_rcu(ptype, offload_head, list) {
4042 if (ptype->type != type || !ptype->callbacks.gro_receive)
4043 continue;
4044 return ptype;
4045 }
4046 return NULL;
4047}
e27a2f83 4048EXPORT_SYMBOL(gro_find_receive_by_type);
bf5a755f
JC
4049
4050struct packet_offload *gro_find_complete_by_type(__be16 type)
4051{
4052 struct list_head *offload_head = &offload_base;
4053 struct packet_offload *ptype;
4054
4055 list_for_each_entry_rcu(ptype, offload_head, list) {
4056 if (ptype->type != type || !ptype->callbacks.gro_complete)
4057 continue;
4058 return ptype;
4059 }
4060 return NULL;
4061}
e27a2f83 4062EXPORT_SYMBOL(gro_find_complete_by_type);
5d38a079 4063
bb728820 4064static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 4065{
5d0d9be8
HX
4066 switch (ret) {
4067 case GRO_NORMAL:
ae78dbfa 4068 if (netif_receive_skb_internal(skb))
c7c4b3b6
BH
4069 ret = GRO_DROP;
4070 break;
5d38a079 4071
5d0d9be8 4072 case GRO_DROP:
5d38a079
HX
4073 kfree_skb(skb);
4074 break;
5b252f0c 4075
daa86548 4076 case GRO_MERGED_FREE:
d7e8883c
ED
4077 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4078 kmem_cache_free(skbuff_head_cache, skb);
4079 else
4080 __kfree_skb(skb);
daa86548
ED
4081 break;
4082
5b252f0c
BH
4083 case GRO_HELD:
4084 case GRO_MERGED:
4085 break;
5d38a079
HX
4086 }
4087
c7c4b3b6 4088 return ret;
5d0d9be8 4089}
5d0d9be8 4090
c7c4b3b6 4091gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 4092{
ae78dbfa 4093 trace_napi_gro_receive_entry(skb);
86911732 4094
a50e233c
ED
4095 skb_gro_reset_offset(skb);
4096
89c5fa33 4097 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
4098}
4099EXPORT_SYMBOL(napi_gro_receive);
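/* Illustrative sketch, not part of dev.c: the same RX path as the
 * netif_receive_skb() example earlier, but letting GRO try to coalesce the
 * frame first. NAPI drivers typically prefer napi_gro_receive() so that
 * NETIF_F_GRO can take effect. foo_rx_one_gro() is an invented name.
 */
static void foo_rx_one_gro(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	skb_record_rx_queue(skb, 0);		/* single-queue example */
	napi_gro_receive(napi, skb);		/* GRO_NORMAL, GRO_HELD, ... */
}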
4100
d0c2b0d2 4101static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 4102{
96e93eab 4103 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
4104 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4105 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 4106 skb->vlan_tci = 0;
66c46d74 4107 skb->dev = napi->dev;
6d152e23 4108 skb->skb_iif = 0;
c3caf119
JC
4109 skb->encapsulation = 0;
4110 skb_shinfo(skb)->gso_type = 0;
e33d0ba8 4111 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
4112
4113 napi->skb = skb;
4114}
96e93eab 4115
76620aaf 4116struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 4117{
5d38a079 4118 struct sk_buff *skb = napi->skb;
5d38a079
HX
4119
4120 if (!skb) {
89d71a66 4121 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
84b9cd63 4122 napi->skb = skb;
80595d59 4123 }
96e93eab
HX
4124 return skb;
4125}
76620aaf 4126EXPORT_SYMBOL(napi_get_frags);
96e93eab 4127
a50e233c
ED
4128static gro_result_t napi_frags_finish(struct napi_struct *napi,
4129 struct sk_buff *skb,
4130 gro_result_t ret)
96e93eab 4131{
5d0d9be8
HX
4132 switch (ret) {
4133 case GRO_NORMAL:
a50e233c
ED
4134 case GRO_HELD:
4135 __skb_push(skb, ETH_HLEN);
4136 skb->protocol = eth_type_trans(skb, skb->dev);
4137 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
c7c4b3b6 4138 ret = GRO_DROP;
86911732 4139 break;
5d38a079 4140
5d0d9be8 4141 case GRO_DROP:
5d0d9be8
HX
4142 case GRO_MERGED_FREE:
4143 napi_reuse_skb(napi, skb);
4144 break;
5b252f0c
BH
4145
4146 case GRO_MERGED:
4147 break;
5d0d9be8 4148 }
5d38a079 4149
c7c4b3b6 4150 return ret;
5d38a079 4151}
5d0d9be8 4152
a50e233c
ED
4153/* Upper GRO stack assumes network header starts at gro_offset=0
4154 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4155 * We copy ethernet header into skb->data to have a common layout.
4156 */
4adb9c4a 4157static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
4158{
4159 struct sk_buff *skb = napi->skb;
a50e233c
ED
4160 const struct ethhdr *eth;
4161 unsigned int hlen = sizeof(*eth);
76620aaf
HX
4162
4163 napi->skb = NULL;
4164
a50e233c
ED
4165 skb_reset_mac_header(skb);
4166 skb_gro_reset_offset(skb);
4167
4168 eth = skb_gro_header_fast(skb, 0);
4169 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4170 eth = skb_gro_header_slow(skb, hlen, 0);
4171 if (unlikely(!eth)) {
4172 napi_reuse_skb(napi, skb);
4173 return NULL;
4174 }
4175 } else {
4176 gro_pull_from_frag0(skb, hlen);
4177 NAPI_GRO_CB(skb)->frag0 += hlen;
4178 NAPI_GRO_CB(skb)->frag0_len -= hlen;
76620aaf 4179 }
a50e233c
ED
4180 __skb_pull(skb, hlen);
4181
4182 /*
4183 * This works because the only protocols we care about don't require
4184 * special handling.
4185 * We'll fix it up properly in napi_frags_finish()
4186 */
4187 skb->protocol = eth->h_proto;
76620aaf 4188
76620aaf
HX
4189 return skb;
4190}
76620aaf 4191
c7c4b3b6 4192gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 4193{
76620aaf 4194 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4195
4196 if (!skb)
c7c4b3b6 4197 return GRO_DROP;
5d0d9be8 4198
ae78dbfa
BH
4199 trace_napi_gro_frags_entry(skb);
4200
89c5fa33 4201 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4202}
5d38a079
HX
4203EXPORT_SYMBOL(napi_gro_frags);
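/* Illustrative sketch, not part of dev.c: the napi_get_frags() /
 * napi_gro_frags() pattern used by drivers that build skbs purely out of
 * page fragments, leaving the Ethernet header handling to the code above.
 * foo_rx_frag() and its page/offset/len arguments are invented.
 */
static void foo_rx_frag(struct napi_struct *napi, struct page *page,
			unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	napi_gro_frags(napi);	/* consumes or recycles napi->skb */
}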
4204
e326bed2 4205/*
855abcf0 4206 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
e326bed2
ED
4207 * Note: called with local irq disabled, but exits with local irq enabled.
4208 */
4209static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4210{
4211#ifdef CONFIG_RPS
4212 struct softnet_data *remsd = sd->rps_ipi_list;
4213
4214 if (remsd) {
4215 sd->rps_ipi_list = NULL;
4216
4217 local_irq_enable();
4218
4219 /* Send pending IPI's to kick RPS processing on remote cpus. */
4220 while (remsd) {
4221 struct softnet_data *next = remsd->rps_ipi_next;
4222
4223 if (cpu_online(remsd->cpu))
c46fff2a 4224 smp_call_function_single_async(remsd->cpu,
fce8ad15 4225 &remsd->csd);
e326bed2
ED
4226 remsd = next;
4227 }
4228 } else
4229#endif
4230 local_irq_enable();
4231}
4232
bea3348e 4233static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4234{
4235 int work = 0;
eecfd7c4 4236 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4237
e326bed2
ED
4238#ifdef CONFIG_RPS
4239 /* Check if we have pending IPIs; it's better to send them now
4240 * rather than waiting for net_rx_action() to end.
4241 */
4242 if (sd->rps_ipi_list) {
4243 local_irq_disable();
4244 net_rps_action_and_irq_enable(sd);
4245 }
4246#endif
bea3348e 4247 napi->weight = weight_p;
6e7676c1 4248 local_irq_disable();
11ef7a89 4249 while (1) {
1da177e4 4250 struct sk_buff *skb;
6e7676c1
CG
4251
4252 while ((skb = __skb_dequeue(&sd->process_queue))) {
4253 local_irq_enable();
4254 __netif_receive_skb(skb);
6e7676c1 4255 local_irq_disable();
76cc8b13
TH
4256 input_queue_head_incr(sd);
4257 if (++work >= quota) {
4258 local_irq_enable();
4259 return work;
4260 }
6e7676c1 4261 }
1da177e4 4262
e36fa2f7 4263 rps_lock(sd);
11ef7a89 4264 if (skb_queue_empty(&sd->input_pkt_queue)) {
eecfd7c4
ED
4265 /*
4266 * Inline a custom version of __napi_complete().
4267 * Only the current cpu owns and manipulates this napi,
11ef7a89
TH
4268 * and NAPI_STATE_SCHED is the only possible flag set
4269 * on backlog.
4270 * We can use a plain write instead of clear_bit(),
eecfd7c4
ED
4271 * and we don't need an smp_mb() memory barrier.
4272 */
4273 list_del(&napi->poll_list);
4274 napi->state = 0;
11ef7a89 4275 rps_unlock(sd);
eecfd7c4 4276
11ef7a89 4277 break;
bea3348e 4278 }
11ef7a89
TH
4279
4280 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4281 &sd->process_queue);
e36fa2f7 4282 rps_unlock(sd);
6e7676c1
CG
4283 }
4284 local_irq_enable();
1da177e4 4285
bea3348e
SH
4286 return work;
4287}
1da177e4 4288
bea3348e
SH
4289/**
4290 * __napi_schedule - schedule for receive
c4ea43c5 4291 * @n: entry to schedule
bea3348e
SH
4292 *
4293 * The entry's receive function will be scheduled to run
4294 */
b5606c2d 4295void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4296{
4297 unsigned long flags;
1da177e4 4298
bea3348e 4299 local_irq_save(flags);
eecfd7c4 4300 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 4301 local_irq_restore(flags);
1da177e4 4302}
bea3348e
SH
4303EXPORT_SYMBOL(__napi_schedule);
4304
d565b0a1
HX
4305void __napi_complete(struct napi_struct *n)
4306{
4307 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4308 BUG_ON(n->gro_list);
4309
4310 list_del(&n->poll_list);
4e857c58 4311 smp_mb__before_atomic();
d565b0a1
HX
4312 clear_bit(NAPI_STATE_SCHED, &n->state);
4313}
4314EXPORT_SYMBOL(__napi_complete);
4315
4316void napi_complete(struct napi_struct *n)
4317{
4318 unsigned long flags;
4319
4320 /*
4321 * don't let napi dequeue from the cpu poll list
4322 * just in case its running on a different cpu
4323 */
4324 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4325 return;
4326
2e71a6f8 4327 napi_gro_flush(n, false);
d565b0a1
HX
4328 local_irq_save(flags);
4329 __napi_complete(n);
4330 local_irq_restore(flags);
4331}
4332EXPORT_SYMBOL(napi_complete);
4333
af12fa6e
ET
4334/* must be called under rcu_read_lock(), as we dont take a reference */
4335struct napi_struct *napi_by_id(unsigned int napi_id)
4336{
4337 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4338 struct napi_struct *napi;
4339
4340 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4341 if (napi->napi_id == napi_id)
4342 return napi;
4343
4344 return NULL;
4345}
4346EXPORT_SYMBOL_GPL(napi_by_id);
4347
4348void napi_hash_add(struct napi_struct *napi)
4349{
4350 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4351
4352 spin_lock(&napi_hash_lock);
4353
4354 /* 0 is not a valid id and we also skip an id that is already taken;
4355 * we expect both events to be extremely rare.
4356 */
4357 napi->napi_id = 0;
4358 while (!napi->napi_id) {
4359 napi->napi_id = ++napi_gen_id;
4360 if (napi_by_id(napi->napi_id))
4361 napi->napi_id = 0;
4362 }
4363
4364 hlist_add_head_rcu(&napi->napi_hash_node,
4365 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4366
4367 spin_unlock(&napi_hash_lock);
4368 }
4369}
4370EXPORT_SYMBOL_GPL(napi_hash_add);
4371
4372/* Warning : caller is responsible to make sure rcu grace period
4373 * is respected before freeing memory containing @napi
4374 */
4375void napi_hash_del(struct napi_struct *napi)
4376{
4377 spin_lock(&napi_hash_lock);
4378
4379 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4380 hlist_del_rcu(&napi->napi_hash_node);
4381
4382 spin_unlock(&napi_hash_lock);
4383}
4384EXPORT_SYMBOL_GPL(napi_hash_del);
4385
d565b0a1
HX
4386void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4387 int (*poll)(struct napi_struct *, int), int weight)
4388{
4389 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 4390 napi->gro_count = 0;
d565b0a1 4391 napi->gro_list = NULL;
5d38a079 4392 napi->skb = NULL;
d565b0a1 4393 napi->poll = poll;
82dc3c63
ED
4394 if (weight > NAPI_POLL_WEIGHT)
4395 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4396 weight, dev->name);
d565b0a1
HX
4397 napi->weight = weight;
4398 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4399 napi->dev = dev;
5d38a079 4400#ifdef CONFIG_NETPOLL
d565b0a1
HX
4401 spin_lock_init(&napi->poll_lock);
4402 napi->poll_owner = -1;
4403#endif
4404 set_bit(NAPI_STATE_SCHED, &napi->state);
4405}
4406EXPORT_SYMBOL(netif_napi_add);
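/* Illustrative sketch, not part of dev.c: the usual life cycle a driver
 * builds around netif_napi_add(). struct foo_priv, foo_poll() and
 * foo_irq() are invented; napi_schedule(), napi_complete(),
 * netif_napi_add() and NAPI_POLL_WEIGHT are the real API.
 */
struct foo_priv {
	struct net_device *netdev;
	struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* ... process up to @budget RX descriptors, counting work_done ... */

	if (work_done < budget) {
		napi_complete(napi);
		/* ... re-enable the device's RX interrupt here ... */
	}
	return work_done;
}

static irqreturn_t foo_irq(int irq, void *dev_id)
{
	struct foo_priv *priv = dev_id;

	/* ... mask the device's RX interrupt here ... */
	napi_schedule(&priv->napi);	/* NAPI_STATE_SCHED is tested for us */
	return IRQ_HANDLED;
}

/* At probe time the driver would register the context above with
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 * and call napi_enable(&priv->napi) from its ndo_open routine.
 */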
4407
4408void netif_napi_del(struct napi_struct *napi)
4409{
d7b06636 4410 list_del_init(&napi->dev_list);
76620aaf 4411 napi_free_frags(napi);
d565b0a1 4412
289dccbe 4413 kfree_skb_list(napi->gro_list);
d565b0a1 4414 napi->gro_list = NULL;
4ae5544f 4415 napi->gro_count = 0;
d565b0a1
HX
4416}
4417EXPORT_SYMBOL(netif_napi_del);
4418
1da177e4
LT
4419static void net_rx_action(struct softirq_action *h)
4420{
e326bed2 4421 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 4422 unsigned long time_limit = jiffies + 2;
51b0bded 4423 int budget = netdev_budget;
53fb95d3
MM
4424 void *have;
4425
1da177e4
LT
4426 local_irq_disable();
4427
e326bed2 4428 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
4429 struct napi_struct *n;
4430 int work, weight;
1da177e4 4431
bea3348e 4432 /* If the softirq window is exhausted then punt.
24f8b238
SH
4433 * Allow this to run for 2 jiffies, which allows
4434 * an average latency of 1.5/HZ.
bea3348e 4435 */
d1f41b67 4436 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
1da177e4
LT
4437 goto softnet_break;
4438
4439 local_irq_enable();
4440
bea3348e
SH
4441 /* Even though interrupts have been re-enabled, this
4442 * access is safe because interrupts can only add new
4443 * entries to the tail of this list, and only ->poll()
4444 * calls can remove this head entry from the list.
4445 */
e326bed2 4446 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 4447
bea3348e
SH
4448 have = netpoll_poll_lock(n);
4449
4450 weight = n->weight;
4451
0a7606c1
DM
4452 /* This NAPI_STATE_SCHED test is for avoiding a race
4453 * with netpoll's poll_napi(). Only the entity which
4454 * obtains the lock and sees NAPI_STATE_SCHED set will
4455 * actually make the ->poll() call. Therefore we avoid
25985edc 4456 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
4457 */
4458 work = 0;
4ea7e386 4459 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 4460 work = n->poll(n, weight);
4ea7e386
NH
4461 trace_napi_poll(n);
4462 }
bea3348e
SH
4463
4464 WARN_ON_ONCE(work > weight);
4465
4466 budget -= work;
4467
4468 local_irq_disable();
4469
4470 /* Drivers must not modify the NAPI state if they
4471 * consume the entire weight. In such cases this code
4472 * still "owns" the NAPI instance and therefore can
4473 * move the instance around on the list at-will.
4474 */
fed17f30 4475 if (unlikely(work == weight)) {
ff780cd8
HX
4476 if (unlikely(napi_disable_pending(n))) {
4477 local_irq_enable();
4478 napi_complete(n);
4479 local_irq_disable();
2e71a6f8
ED
4480 } else {
4481 if (n->gro_list) {
4482 /* flush too old packets
4483 * If HZ < 1000, flush all packets.
4484 */
4485 local_irq_enable();
4486 napi_gro_flush(n, HZ >= 1000);
4487 local_irq_disable();
4488 }
e326bed2 4489 list_move_tail(&n->poll_list, &sd->poll_list);
2e71a6f8 4490 }
fed17f30 4491 }
bea3348e
SH
4492
4493 netpoll_poll_unlock(have);
1da177e4
LT
4494 }
4495out:
e326bed2 4496 net_rps_action_and_irq_enable(sd);
0a9627f2 4497
db217334
CL
4498#ifdef CONFIG_NET_DMA
4499 /*
4500 * There may not be any more sk_buffs coming right now, so push
4501 * any pending DMA copies to hardware
4502 */
2ba05622 4503 dma_issue_pending_all();
db217334 4504#endif
bea3348e 4505
1da177e4
LT
4506 return;
4507
4508softnet_break:
dee42870 4509 sd->time_squeeze++;
1da177e4
LT
4510 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4511 goto out;
4512}
4513
aa9d8560 4514struct netdev_adjacent {
9ff162a8 4515 struct net_device *dev;
5d261913
VF
4516
4517 /* upper master flag, there can only be one master device per list */
9ff162a8 4518 bool master;
5d261913 4519
5d261913
VF
4520 /* counter for the number of times this device was added to us */
4521 u16 ref_nr;
4522
402dae96
VF
4523 /* private field for the users */
4524 void *private;
4525
9ff162a8
JP
4526 struct list_head list;
4527 struct rcu_head rcu;
9ff162a8
JP
4528};
4529
5d261913
VF
4530static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4531 struct net_device *adj_dev,
2f268f12 4532 struct list_head *adj_list)
9ff162a8 4533{
5d261913 4534 struct netdev_adjacent *adj;
5d261913 4535
2f268f12 4536 list_for_each_entry(adj, adj_list, list) {
5d261913
VF
4537 if (adj->dev == adj_dev)
4538 return adj;
9ff162a8
JP
4539 }
4540 return NULL;
4541}
4542
4543/**
4544 * netdev_has_upper_dev - Check if device is linked to an upper device
4545 * @dev: device
4546 * @upper_dev: upper device to check
4547 *
4548 * Find out if a device is linked to specified upper device and return true
4549 * in case it is. Note that this checks only immediate upper device,
4550 * not through a complete stack of devices. The caller must hold the RTNL lock.
4551 */
4552bool netdev_has_upper_dev(struct net_device *dev,
4553 struct net_device *upper_dev)
4554{
4555 ASSERT_RTNL();
4556
2f268f12 4557 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
9ff162a8
JP
4558}
4559EXPORT_SYMBOL(netdev_has_upper_dev);
4560
4561/**
4562 * netdev_has_any_upper_dev - Check if device is linked to some device
4563 * @dev: device
4564 *
4565 * Find out if a device is linked to an upper device and return true in case
4566 * it is. The caller must hold the RTNL lock.
4567 */
1d143d9f 4568static bool netdev_has_any_upper_dev(struct net_device *dev)
9ff162a8
JP
4569{
4570 ASSERT_RTNL();
4571
2f268f12 4572 return !list_empty(&dev->all_adj_list.upper);
9ff162a8 4573}
9ff162a8
JP
4574
4575/**
4576 * netdev_master_upper_dev_get - Get master upper device
4577 * @dev: device
4578 *
4579 * Find a master upper device and return pointer to it or NULL in case
4580 * it's not there. The caller must hold the RTNL lock.
4581 */
4582struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4583{
aa9d8560 4584 struct netdev_adjacent *upper;
9ff162a8
JP
4585
4586 ASSERT_RTNL();
4587
2f268f12 4588 if (list_empty(&dev->adj_list.upper))
9ff162a8
JP
4589 return NULL;
4590
2f268f12 4591 upper = list_first_entry(&dev->adj_list.upper,
aa9d8560 4592 struct netdev_adjacent, list);
9ff162a8
JP
4593 if (likely(upper->master))
4594 return upper->dev;
4595 return NULL;
4596}
4597EXPORT_SYMBOL(netdev_master_upper_dev_get);
4598
b6ccba4c
VF
4599void *netdev_adjacent_get_private(struct list_head *adj_list)
4600{
4601 struct netdev_adjacent *adj;
4602
4603 adj = list_entry(adj_list, struct netdev_adjacent, list);
4604
4605 return adj->private;
4606}
4607EXPORT_SYMBOL(netdev_adjacent_get_private);
4608
44a40855
VY
4609/**
4610 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4611 * @dev: device
4612 * @iter: list_head ** of the current position
4613 *
4614 * Gets the next device from the dev's upper list, starting from iter
4615 * position. The caller must hold RCU read lock.
4616 */
4617struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4618 struct list_head **iter)
4619{
4620 struct netdev_adjacent *upper;
4621
4622 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4623
4624 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4625
4626 if (&upper->list == &dev->adj_list.upper)
4627 return NULL;
4628
4629 *iter = &upper->list;
4630
4631 return upper->dev;
4632}
4633EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
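/* Illustrative sketch, not part of dev.c: walking a device's immediate
 * upper devices with the iterator above. foo_dump_uppers() is an invented
 * name; the loop shape mirrors how in-tree helpers drive
 * netdev_upper_get_next_dev_rcu().
 */
static void foo_dump_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_info("%s is an upper device of %s\n",
			upper->name, dev->name);
	rcu_read_unlock();
}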
4634
31088a11
VF
4635/**
4636 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
48311f46
VF
4637 * @dev: device
4638 * @iter: list_head ** of the current position
4639 *
4640 * Gets the next device from the dev's upper list, starting from iter
4641 * position. The caller must hold RCU read lock.
4642 */
2f268f12
VF
4643struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4644 struct list_head **iter)
48311f46
VF
4645{
4646 struct netdev_adjacent *upper;
4647
85328240 4648 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
48311f46
VF
4649
4650 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4651
2f268f12 4652 if (&upper->list == &dev->all_adj_list.upper)
48311f46
VF
4653 return NULL;
4654
4655 *iter = &upper->list;
4656
4657 return upper->dev;
4658}
2f268f12 4659EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
48311f46 4660
31088a11
VF
4661/**
4662 * netdev_lower_get_next_private - Get the next ->private from the
4663 * lower neighbour list
4664 * @dev: device
4665 * @iter: list_head ** of the current position
4666 *
4667 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4668 * list, starting from iter position. The caller must either hold the
4669 * RTNL lock or its own locking that guarantees that the neighbour lower
4670 * list will remain unchanged.
4671 */
4672void *netdev_lower_get_next_private(struct net_device *dev,
4673 struct list_head **iter)
4674{
4675 struct netdev_adjacent *lower;
4676
4677 lower = list_entry(*iter, struct netdev_adjacent, list);
4678
4679 if (&lower->list == &dev->adj_list.lower)
4680 return NULL;
4681
6859e7df 4682 *iter = lower->list.next;
31088a11
VF
4683
4684 return lower->private;
4685}
4686EXPORT_SYMBOL(netdev_lower_get_next_private);
4687
4688/**
4689 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4690 * lower neighbour list, RCU
4691 * variant
4692 * @dev: device
4693 * @iter: list_head ** of the current position
4694 *
4695 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4696 * list, starting from iter position. The caller must hold RCU read lock.
4697 */
4698void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4699 struct list_head **iter)
4700{
4701 struct netdev_adjacent *lower;
4702
4703 WARN_ON_ONCE(!rcu_read_lock_held());
4704
4705 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4706
4707 if (&lower->list == &dev->adj_list.lower)
4708 return NULL;
4709
6859e7df 4710 *iter = &lower->list;
31088a11
VF
4711
4712 return lower->private;
4713}
4714EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4715
4085ebe8
VY
4716/**
4717 * netdev_lower_get_next - Get the next device from the lower neighbour
4718 * list
4719 * @dev: device
4720 * @iter: list_head ** of the current position
4721 *
4722 * Gets the next netdev_adjacent from the dev's lower neighbour
4723 * list, starting from iter position. The caller must hold RTNL lock or
4724 * its own locking that guarantees that the neighbour lower
4725 * list will remain unchanged.
4726 */
4727void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4728{
4729 struct netdev_adjacent *lower;
4730
4731 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4732
4733 if (&lower->list == &dev->adj_list.lower)
4734 return NULL;
4735
4736 *iter = &lower->list;
4737
4738 return lower->dev;
4739}
4740EXPORT_SYMBOL(netdev_lower_get_next);
4741
e001bfad 4742/**
4743 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4744 * lower neighbour list, RCU
4745 * variant
4746 * @dev: device
4747 *
4748 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4749 * list. The caller must hold RCU read lock.
4750 */
4751void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4752{
4753 struct netdev_adjacent *lower;
4754
4755 lower = list_first_or_null_rcu(&dev->adj_list.lower,
4756 struct netdev_adjacent, list);
4757 if (lower)
4758 return lower->private;
4759 return NULL;
4760}
4761EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4762
9ff162a8
JP
4763/**
4764 * netdev_master_upper_dev_get_rcu - Get master upper device
4765 * @dev: device
4766 *
4767 * Find a master upper device and return pointer to it or NULL in case
4768 * it's not there. The caller must hold the RCU read lock.
4769 */
4770struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4771{
aa9d8560 4772 struct netdev_adjacent *upper;
9ff162a8 4773
2f268f12 4774 upper = list_first_or_null_rcu(&dev->adj_list.upper,
aa9d8560 4775 struct netdev_adjacent, list);
9ff162a8
JP
4776 if (upper && likely(upper->master))
4777 return upper->dev;
4778 return NULL;
4779}
4780EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4781
0a59f3a9 4782static int netdev_adjacent_sysfs_add(struct net_device *dev,
3ee32707
VF
4783 struct net_device *adj_dev,
4784 struct list_head *dev_list)
4785{
4786 char linkname[IFNAMSIZ+7];
4787 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4788 "upper_%s" : "lower_%s", adj_dev->name);
4789 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4790 linkname);
4791}
0a59f3a9 4792static void netdev_adjacent_sysfs_del(struct net_device *dev,
3ee32707
VF
4793 char *name,
4794 struct list_head *dev_list)
4795{
4796 char linkname[IFNAMSIZ+7];
4797 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4798 "upper_%s" : "lower_%s", name);
4799 sysfs_remove_link(&(dev->dev.kobj), linkname);
4800}
4801
4802#define netdev_adjacent_is_neigh_list(dev, dev_list) \
4803 (dev_list == &dev->adj_list.upper || \
4804 dev_list == &dev->adj_list.lower)
4805
5d261913
VF
4806static int __netdev_adjacent_dev_insert(struct net_device *dev,
4807 struct net_device *adj_dev,
7863c054 4808 struct list_head *dev_list,
402dae96 4809 void *private, bool master)
5d261913
VF
4810{
4811 struct netdev_adjacent *adj;
842d67a7 4812 int ret;
5d261913 4813
7863c054 4814 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913
VF
4815
4816 if (adj) {
5d261913
VF
4817 adj->ref_nr++;
4818 return 0;
4819 }
4820
4821 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4822 if (!adj)
4823 return -ENOMEM;
4824
4825 adj->dev = adj_dev;
4826 adj->master = master;
5d261913 4827 adj->ref_nr = 1;
402dae96 4828 adj->private = private;
5d261913 4829 dev_hold(adj_dev);
2f268f12
VF
4830
4831 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4832 adj_dev->name, dev->name, adj_dev->name);
5d261913 4833
3ee32707
VF
4834 if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4835 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5831d66e
VF
4836 if (ret)
4837 goto free_adj;
4838 }
4839
7863c054 4840 /* Ensure that master link is always the first item in list. */
842d67a7
VF
4841 if (master) {
4842 ret = sysfs_create_link(&(dev->dev.kobj),
4843 &(adj_dev->dev.kobj), "master");
4844 if (ret)
5831d66e 4845 goto remove_symlinks;
842d67a7 4846
7863c054 4847 list_add_rcu(&adj->list, dev_list);
842d67a7 4848 } else {
7863c054 4849 list_add_tail_rcu(&adj->list, dev_list);
842d67a7 4850 }
5d261913
VF
4851
4852 return 0;
842d67a7 4853
5831d66e 4854remove_symlinks:
3ee32707
VF
4855 if (netdev_adjacent_is_neigh_list(dev, dev_list))
4856 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
842d67a7
VF
4857free_adj:
4858 kfree(adj);
974daef7 4859 dev_put(adj_dev);
842d67a7
VF
4860
4861 return ret;
5d261913
VF
4862}
4863
1d143d9f 4864static void __netdev_adjacent_dev_remove(struct net_device *dev,
4865 struct net_device *adj_dev,
4866 struct list_head *dev_list)
5d261913
VF
4867{
4868 struct netdev_adjacent *adj;
4869
7863c054 4870 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913 4871
2f268f12
VF
4872 if (!adj) {
4873 pr_err("tried to remove device %s from %s\n",
4874 dev->name, adj_dev->name);
5d261913 4875 BUG();
2f268f12 4876 }
5d261913
VF
4877
4878 if (adj->ref_nr > 1) {
2f268f12
VF
4879 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4880 adj->ref_nr-1);
5d261913
VF
4881 adj->ref_nr--;
4882 return;
4883 }
4884
842d67a7
VF
4885 if (adj->master)
4886 sysfs_remove_link(&(dev->dev.kobj), "master");
4887
3ee32707
VF
4888 if (netdev_adjacent_is_neigh_list(dev, dev_list))
4889 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831d66e 4890
5d261913 4891 list_del_rcu(&adj->list);
2f268f12
VF
4892 pr_debug("dev_put for %s, because link removed from %s to %s\n",
4893 adj_dev->name, dev->name, adj_dev->name);
5d261913
VF
4894 dev_put(adj_dev);
4895 kfree_rcu(adj, rcu);
4896}
4897
1d143d9f 4898static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4899 struct net_device *upper_dev,
4900 struct list_head *up_list,
4901 struct list_head *down_list,
4902 void *private, bool master)
5d261913
VF
4903{
4904 int ret;
4905
402dae96
VF
4906 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4907 master);
5d261913
VF
4908 if (ret)
4909 return ret;
4910
402dae96
VF
4911 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4912 false);
5d261913 4913 if (ret) {
2f268f12 4914 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5d261913
VF
4915 return ret;
4916 }
4917
4918 return 0;
4919}
4920
1d143d9f 4921static int __netdev_adjacent_dev_link(struct net_device *dev,
4922 struct net_device *upper_dev)
5d261913 4923{
2f268f12
VF
4924 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4925 &dev->all_adj_list.upper,
4926 &upper_dev->all_adj_list.lower,
402dae96 4927 NULL, false);
5d261913
VF
4928}
4929
1d143d9f 4930static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4931 struct net_device *upper_dev,
4932 struct list_head *up_list,
4933 struct list_head *down_list)
5d261913 4934{
2f268f12
VF
4935 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4936 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5d261913
VF
4937}
4938
1d143d9f 4939static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4940 struct net_device *upper_dev)
5d261913 4941{
2f268f12
VF
4942 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4943 &dev->all_adj_list.upper,
4944 &upper_dev->all_adj_list.lower);
4945}
4946
1d143d9f 4947static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4948 struct net_device *upper_dev,
4949 void *private, bool master)
2f268f12
VF
4950{
4951 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4952
4953 if (ret)
4954 return ret;
4955
4956 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4957 &dev->adj_list.upper,
4958 &upper_dev->adj_list.lower,
402dae96 4959 private, master);
2f268f12
VF
4960 if (ret) {
4961 __netdev_adjacent_dev_unlink(dev, upper_dev);
4962 return ret;
4963 }
4964
4965 return 0;
5d261913
VF
4966}
4967
1d143d9f 4968static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4969 struct net_device *upper_dev)
2f268f12
VF
4970{
4971 __netdev_adjacent_dev_unlink(dev, upper_dev);
4972 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4973 &dev->adj_list.upper,
4974 &upper_dev->adj_list.lower);
4975}
5d261913 4976
9ff162a8 4977static int __netdev_upper_dev_link(struct net_device *dev,
402dae96
VF
4978 struct net_device *upper_dev, bool master,
4979 void *private)
9ff162a8 4980{
5d261913
VF
4981 struct netdev_adjacent *i, *j, *to_i, *to_j;
4982 int ret = 0;
9ff162a8
JP
4983
4984 ASSERT_RTNL();
4985
4986 if (dev == upper_dev)
4987 return -EBUSY;
4988
4989 /* To prevent loops, check if dev is not upper device to upper_dev. */
2f268f12 4990 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
9ff162a8
JP
4991 return -EBUSY;
4992
2f268f12 4993 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
9ff162a8
JP
4994 return -EEXIST;
4995
4996 if (master && netdev_master_upper_dev_get(dev))
4997 return -EBUSY;
4998
402dae96
VF
4999 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5000 master);
5d261913
VF
5001 if (ret)
5002 return ret;
9ff162a8 5003
5d261913 5004 /* Now that we linked these devs, make all the upper_dev's
2f268f12 5005 * all_adj_list.upper visible to every dev's all_adj_list.lower and vice
5d261913
VF
5006 * versa, and don't forget the devices themselves. All of these
5007 * links are non-neighbours.
5008 */
2f268f12
VF
5009 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5010 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5011 pr_debug("Interlinking %s with %s, non-neighbour\n",
5012 i->dev->name, j->dev->name);
5d261913
VF
5013 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5014 if (ret)
5015 goto rollback_mesh;
5016 }
5017 }
5018
5019 /* add dev to every upper_dev's upper device */
2f268f12
VF
5020 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5021 pr_debug("linking %s's upper device %s with %s\n",
5022 upper_dev->name, i->dev->name, dev->name);
5d261913
VF
5023 ret = __netdev_adjacent_dev_link(dev, i->dev);
5024 if (ret)
5025 goto rollback_upper_mesh;
5026 }
5027
5028 /* add upper_dev to every dev's lower device */
2f268f12
VF
5029 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5030 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5031 i->dev->name, upper_dev->name);
5d261913
VF
5032 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5033 if (ret)
5034 goto rollback_lower_mesh;
5035 }
9ff162a8 5036
42e52bf9 5037 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8 5038 return 0;
5d261913
VF
5039
5040rollback_lower_mesh:
5041 to_i = i;
2f268f12 5042 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5d261913
VF
5043 if (i == to_i)
5044 break;
5045 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5046 }
5047
5048 i = NULL;
5049
5050rollback_upper_mesh:
5051 to_i = i;
2f268f12 5052 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5053 if (i == to_i)
5054 break;
5055 __netdev_adjacent_dev_unlink(dev, i->dev);
5056 }
5057
5058 i = j = NULL;
5059
5060rollback_mesh:
5061 to_i = i;
5062 to_j = j;
2f268f12
VF
5063 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5064 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5065 if (i == to_i && j == to_j)
5066 break;
5067 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5068 }
5069 if (i == to_i)
5070 break;
5071 }
5072
2f268f12 5073 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5074
5075 return ret;
9ff162a8
JP
5076}
5077
5078/**
5079 * netdev_upper_dev_link - Add a link to the upper device
5080 * @dev: device
5081 * @upper_dev: new upper device
5082 *
5083 * Adds a link to device which is upper to this one. The caller must hold
5084 * the RTNL lock. On a failure a negative errno code is returned.
5085 * On success the reference counts are adjusted and the function
5086 * returns zero.
5087 */
5088int netdev_upper_dev_link(struct net_device *dev,
5089 struct net_device *upper_dev)
5090{
402dae96 5091 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
9ff162a8
JP
5092}
5093EXPORT_SYMBOL(netdev_upper_dev_link);
5094
5095/**
5096 * netdev_master_upper_dev_link - Add a master link to the upper device
5097 * @dev: device
5098 * @upper_dev: new upper device
5099 *
5100 * Adds a link to device which is upper to this one. In this case, only
5101 * one master upper device can be linked, although other non-master devices
5102 * might be linked as well. The caller must hold the RTNL lock.
5103 * On a failure a negative errno code is returned. On success the reference
5104 * counts are adjusted and the function returns zero.
5105 */
5106int netdev_master_upper_dev_link(struct net_device *dev,
5107 struct net_device *upper_dev)
5108{
402dae96 5109 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
9ff162a8
JP
5110}
5111EXPORT_SYMBOL(netdev_master_upper_dev_link);
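/* Illustrative sketch, not part of dev.c: how a bonding/team-like master
 * might tie a slave to itself under RTNL and undo the link on a later
 * failure. foo_enslave() is an invented name; the
 * netdev_master_upper_dev_link()/netdev_upper_dev_unlink() calls are the
 * API documented here.
 */
static int foo_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	err = dev_set_promiscuity(slave, 1);	/* example follow-up step */
	if (err) {
		netdev_upper_dev_unlink(slave, master);
		return err;
	}

	return 0;
}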
5112
402dae96
VF
5113int netdev_master_upper_dev_link_private(struct net_device *dev,
5114 struct net_device *upper_dev,
5115 void *private)
5116{
5117 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5118}
5119EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5120
9ff162a8
JP
5121/**
5122 * netdev_upper_dev_unlink - Removes a link to upper device
5123 * @dev: device
5124 * @upper_dev: new upper device
5125 *
5126 * Removes a link to device which is upper to this one. The caller must hold
5127 * the RTNL lock.
5128 */
5129void netdev_upper_dev_unlink(struct net_device *dev,
5130 struct net_device *upper_dev)
5131{
5d261913 5132 struct netdev_adjacent *i, *j;
9ff162a8
JP
5133 ASSERT_RTNL();
5134
2f268f12 5135 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5136
5137 /* Here is the tricky part. We must remove all dev's lower
5138 * devices from all upper_dev's upper devices and vice
5139 * versa, to maintain the graph relationship.
5140 */
2f268f12
VF
5141 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5142 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5143 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5144
5145 /* also remove the devices themselves from the lower/upper device
5146 * lists
5147 */
2f268f12 5148 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5d261913
VF
5149 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5150
2f268f12 5151 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5152 __netdev_adjacent_dev_unlink(dev, i->dev);
5153
42e52bf9 5154 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
5155}
5156EXPORT_SYMBOL(netdev_upper_dev_unlink);
5157
5bb025fa 5158void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
402dae96 5159{
5bb025fa 5160 struct netdev_adjacent *iter;
402dae96 5161
5bb025fa
VF
5162 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5163 netdev_adjacent_sysfs_del(iter->dev, oldname,
5164 &iter->dev->adj_list.lower);
5165 netdev_adjacent_sysfs_add(iter->dev, dev,
5166 &iter->dev->adj_list.lower);
5167 }
402dae96 5168
5bb025fa
VF
5169 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5170 netdev_adjacent_sysfs_del(iter->dev, oldname,
5171 &iter->dev->adj_list.upper);
5172 netdev_adjacent_sysfs_add(iter->dev, dev,
5173 &iter->dev->adj_list.upper);
5174 }
402dae96 5175}
402dae96
VF
5176
5177void *netdev_lower_dev_get_private(struct net_device *dev,
5178 struct net_device *lower_dev)
5179{
5180 struct netdev_adjacent *lower;
5181
5182 if (!lower_dev)
5183 return NULL;
5184 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5185 if (!lower)
5186 return NULL;
5187
5188 return lower->private;
5189}
5190EXPORT_SYMBOL(netdev_lower_dev_get_private);
5191
4085ebe8
VY
5192
5193int dev_get_nest_level(struct net_device *dev,
5194 bool (*type_check)(struct net_device *dev))
5195{
5196 struct net_device *lower = NULL;
5197 struct list_head *iter;
5198 int max_nest = -1;
5199 int nest;
5200
5201 ASSERT_RTNL();
5202
5203 netdev_for_each_lower_dev(dev, lower, iter) {
5204 nest = dev_get_nest_level(lower, type_check);
5205 if (max_nest < nest)
5206 max_nest = nest;
5207 }
5208
5209 if (type_check(dev))
5210 max_nest++;
5211
5212 return max_nest;
5213}
5214EXPORT_SYMBOL(dev_get_nest_level);
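/* Illustrative sketch, not part of dev.c: dev_get_nest_level() computes
 * how deeply devices of one kind are stacked below @dev, which stackable
 * drivers can feed to lockdep as a nesting subclass. foo_is_foo_dev() and
 * foo_set_lockdep_class() are invented; real callers pass predicates such
 * as is_vlan_dev().
 */
static bool foo_is_foo_dev(struct net_device *dev)
{
	return dev->priv_flags & IFF_MACVLAN;	/* stand-in check for the example */
}

static void foo_set_lockdep_class(struct net_device *dev)
{
	int nest_level = dev_get_nest_level(dev, foo_is_foo_dev);

	/* e.g. use @nest_level as the subclass argument of
	 * lockdep_set_class_and_subclass() on a per-device lock.
	 */
	(void)nest_level;
}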
5215
b6c40d68
PM
5216static void dev_change_rx_flags(struct net_device *dev, int flags)
5217{
d314774c
SH
5218 const struct net_device_ops *ops = dev->netdev_ops;
5219
d2615bf4 5220 if (ops->ndo_change_rx_flags)
d314774c 5221 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
5222}
5223
991fb3f7 5224static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
1da177e4 5225{
b536db93 5226 unsigned int old_flags = dev->flags;
d04a48b0
EB
5227 kuid_t uid;
5228 kgid_t gid;
1da177e4 5229
24023451
PM
5230 ASSERT_RTNL();
5231
dad9b335
WC
5232 dev->flags |= IFF_PROMISC;
5233 dev->promiscuity += inc;
5234 if (dev->promiscuity == 0) {
5235 /*
5236 * Avoid overflow.
5237 * If inc causes an overflow, leave promiscuity untouched and return an error.
5238 */
5239 if (inc < 0)
5240 dev->flags &= ~IFF_PROMISC;
5241 else {
5242 dev->promiscuity -= inc;
7b6cd1ce
JP
5243 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5244 dev->name);
dad9b335
WC
5245 return -EOVERFLOW;
5246 }
5247 }
52609c0b 5248 if (dev->flags != old_flags) {
7b6cd1ce
JP
5249 pr_info("device %s %s promiscuous mode\n",
5250 dev->name,
5251 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
5252 if (audit_enabled) {
5253 current_uid_gid(&uid, &gid);
7759db82
KHK
5254 audit_log(current->audit_context, GFP_ATOMIC,
5255 AUDIT_ANOM_PROMISCUOUS,
5256 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5257 dev->name, (dev->flags & IFF_PROMISC),
5258 (old_flags & IFF_PROMISC),
e1760bd5 5259 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
5260 from_kuid(&init_user_ns, uid),
5261 from_kgid(&init_user_ns, gid),
7759db82 5262 audit_get_sessionid(current));
8192b0c4 5263 }
24023451 5264
b6c40d68 5265 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 5266 }
991fb3f7
ND
5267 if (notify)
5268 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
dad9b335 5269 return 0;
1da177e4
LT
5270}
5271
4417da66
PM
5272/**
5273 * dev_set_promiscuity - update promiscuity count on a device
5274 * @dev: device
5275 * @inc: modifier
5276 *
5277 * Add or remove promiscuity from a device. While the count in the device
5278 * remains above zero the interface remains promiscuous. Once it hits zero
5279 * the device reverts to normal filtering operation. A negative @inc
5280 * value is used to drop promiscuity on the device.
dad9b335 5281 * Return 0 if successful or a negative errno code on error.
4417da66 5282 */
dad9b335 5283int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 5284{
b536db93 5285 unsigned int old_flags = dev->flags;
dad9b335 5286 int err;
4417da66 5287
991fb3f7 5288 err = __dev_set_promiscuity(dev, inc, true);
4b5a698e 5289 if (err < 0)
dad9b335 5290 return err;
4417da66
PM
5291 if (dev->flags != old_flags)
5292 dev_set_rx_mode(dev);
dad9b335 5293 return err;
4417da66 5294}
d1b19dff 5295EXPORT_SYMBOL(dev_set_promiscuity);
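
/* A minimal usage sketch (hypothetical caller, not from dev.c): take and
 * later release one promiscuity reference on a lower device, as a
 * packet-tapping driver might. dev_set_promiscuity() must run under RTNL.
 */
static int tap_enable_promisc(struct net_device *lower)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(lower, 1);	/* may fail with -EOVERFLOW */
	rtnl_unlock();
	return err;
}

static void tap_disable_promisc(struct net_device *lower)
{
	rtnl_lock();
	dev_set_promiscuity(lower, -1);		/* drop our reference */
	rtnl_unlock();
}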
4417da66 5296
991fb3f7 5297static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
1da177e4 5298{
991fb3f7 5299 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
1da177e4 5300
24023451
PM
5301 ASSERT_RTNL();
5302
1da177e4 5303 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
5304 dev->allmulti += inc;
5305 if (dev->allmulti == 0) {
5306 /*
5307 * Avoid overflow.
5309 * If inc would cause the counter to overflow, leave allmulti untouched and return an error.
5309 */
5310 if (inc < 0)
5311 dev->flags &= ~IFF_ALLMULTI;
5312 else {
5313 dev->allmulti -= inc;
7b6cd1ce
JP
5314 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5315 dev->name);
dad9b335
WC
5316 return -EOVERFLOW;
5317 }
5318 }
24023451 5319 if (dev->flags ^ old_flags) {
b6c40d68 5320 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 5321 dev_set_rx_mode(dev);
991fb3f7
ND
5322 if (notify)
5323 __dev_notify_flags(dev, old_flags,
5324 dev->gflags ^ old_gflags);
24023451 5325 }
dad9b335 5326 return 0;
4417da66 5327}
991fb3f7
ND
5328
5329/**
5330 * dev_set_allmulti - update allmulti count on a device
5331 * @dev: device
5332 * @inc: modifier
5333 *
5334 * Add or remove reception of all multicast frames on a device. While the
5335 * count in the device remains above zero the interface keeps receiving
5336 * all multicast frames. Once it hits zero the device reverts to normal
5337 * filtering operation. A negative @inc value is used to drop the counter
5338 * when releasing a resource needing all multicasts.
5339 * Return 0 if successful or a negative errno code on error.
5340 */
5341
5342int dev_set_allmulti(struct net_device *dev, int inc)
5343{
5344 return __dev_set_allmulti(dev, inc, true);
5345}
d1b19dff 5346EXPORT_SYMBOL(dev_set_allmulti);
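
/* A minimal usage sketch (hypothetical stacking driver, not from dev.c):
 * an upper device that needs every multicast frame from its lower device
 * bumps the allmulti count while attached. Requires RTNL.
 */
static int upper_attach_lower(struct net_device *lower)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_allmulti(lower, 1);
	if (err < 0)
		return err;		/* counter overflow, nothing changed */
	return 0;
}

static void upper_detach_lower(struct net_device *lower)
{
	ASSERT_RTNL();
	dev_set_allmulti(lower, -1);	/* release our reference */
}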
4417da66
PM
5347
5348/*
5349 * Upload unicast and multicast address lists to device and
5350 * configure RX filtering. When the device doesn't support unicast
53ccaae1 5351 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
5352 * are present.
5353 */
5354void __dev_set_rx_mode(struct net_device *dev)
5355{
d314774c
SH
5356 const struct net_device_ops *ops = dev->netdev_ops;
5357
4417da66
PM
5358 /* dev_open will call this function so the list will stay sane. */
5359 if (!(dev->flags&IFF_UP))
5360 return;
5361
5362 if (!netif_device_present(dev))
40b77c94 5363 return;
4417da66 5364
01789349 5365 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
5366 /* Unicast addresses changes may only happen under the rtnl,
5367 * therefore calling __dev_set_promiscuity here is safe.
5368 */
32e7bfc4 5369 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
991fb3f7 5370 __dev_set_promiscuity(dev, 1, false);
2d348d1f 5371 dev->uc_promisc = true;
32e7bfc4 5372 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
991fb3f7 5373 __dev_set_promiscuity(dev, -1, false);
2d348d1f 5374 dev->uc_promisc = false;
4417da66 5375 }
4417da66 5376 }
01789349
JP
5377
5378 if (ops->ndo_set_rx_mode)
5379 ops->ndo_set_rx_mode(dev);
4417da66
PM
5380}
5381
5382void dev_set_rx_mode(struct net_device *dev)
5383{
b9e40857 5384 netif_addr_lock_bh(dev);
4417da66 5385 __dev_set_rx_mode(dev);
b9e40857 5386 netif_addr_unlock_bh(dev);
1da177e4
LT
5387}
5388
f0db275a
SH
5389/**
5390 * dev_get_flags - get flags reported to userspace
5391 * @dev: device
5392 *
5393 * Get the combination of flag bits exported through APIs to userspace.
5394 */
95c96174 5395unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 5396{
95c96174 5397 unsigned int flags;
1da177e4
LT
5398
5399 flags = (dev->flags & ~(IFF_PROMISC |
5400 IFF_ALLMULTI |
b00055aa
SR
5401 IFF_RUNNING |
5402 IFF_LOWER_UP |
5403 IFF_DORMANT)) |
1da177e4
LT
5404 (dev->gflags & (IFF_PROMISC |
5405 IFF_ALLMULTI));
5406
b00055aa
SR
5407 if (netif_running(dev)) {
5408 if (netif_oper_up(dev))
5409 flags |= IFF_RUNNING;
5410 if (netif_carrier_ok(dev))
5411 flags |= IFF_LOWER_UP;
5412 if (netif_dormant(dev))
5413 flags |= IFF_DORMANT;
5414 }
1da177e4
LT
5415
5416 return flags;
5417}
d1b19dff 5418EXPORT_SYMBOL(dev_get_flags);
1da177e4 5419
bd380811 5420int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 5421{
b536db93 5422 unsigned int old_flags = dev->flags;
bd380811 5423 int ret;
1da177e4 5424
24023451
PM
5425 ASSERT_RTNL();
5426
1da177e4
LT
5427 /*
5428 * Set the flags on our device.
5429 */
5430
5431 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5432 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5433 IFF_AUTOMEDIA)) |
5434 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5435 IFF_ALLMULTI));
5436
5437 /*
5438 * Load in the correct multicast list now the flags have changed.
5439 */
5440
b6c40d68
PM
5441 if ((old_flags ^ flags) & IFF_MULTICAST)
5442 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 5443
4417da66 5444 dev_set_rx_mode(dev);
1da177e4
LT
5445
5446 /*
5447 * Have we downed the interface? We handle IFF_UP ourselves
5448 * according to user attempts to set it, rather than blindly
5449 * setting it.
5450 */
5451
5452 ret = 0;
d215d10f 5453 if ((old_flags ^ flags) & IFF_UP)
bd380811 5454 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4 5455
1da177e4 5456 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff 5457 int inc = (flags & IFF_PROMISC) ? 1 : -1;
991fb3f7 5458 unsigned int old_flags = dev->flags;
d1b19dff 5459
1da177e4 5460 dev->gflags ^= IFF_PROMISC;
991fb3f7
ND
5461
5462 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5463 if (dev->flags != old_flags)
5464 dev_set_rx_mode(dev);
1da177e4
LT
5465 }
5466
5467 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5468 is important. Some (broken) drivers set IFF_PROMISC when
5469 IFF_ALLMULTI is requested, without asking us and without reporting it.
5470 */
5471 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
5472 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5473
1da177e4 5474 dev->gflags ^= IFF_ALLMULTI;
991fb3f7 5475 __dev_set_allmulti(dev, inc, false);
1da177e4
LT
5476 }
5477
bd380811
PM
5478 return ret;
5479}
5480
a528c219
ND
5481void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5482 unsigned int gchanges)
bd380811
PM
5483{
5484 unsigned int changes = dev->flags ^ old_flags;
5485
a528c219 5486 if (gchanges)
7f294054 5487 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
a528c219 5488
bd380811
PM
5489 if (changes & IFF_UP) {
5490 if (dev->flags & IFF_UP)
5491 call_netdevice_notifiers(NETDEV_UP, dev);
5492 else
5493 call_netdevice_notifiers(NETDEV_DOWN, dev);
5494 }
5495
5496 if (dev->flags & IFF_UP &&
be9efd36
JP
5497 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5498 struct netdev_notifier_change_info change_info;
5499
5500 change_info.flags_changed = changes;
5501 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5502 &change_info.info);
5503 }
bd380811
PM
5504}
5505
5506/**
5507 * dev_change_flags - change device settings
5508 * @dev: device
5509 * @flags: device state flags
5510 *
5511 * Change settings on device based state flags. The flags are
5512 * in the userspace exported format.
5513 */
b536db93 5514int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 5515{
b536db93 5516 int ret;
991fb3f7 5517 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
bd380811
PM
5518
5519 ret = __dev_change_flags(dev, flags);
5520 if (ret < 0)
5521 return ret;
5522
991fb3f7 5523 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
a528c219 5524 __dev_notify_flags(dev, old_flags, changes);
1da177e4
LT
5525 return ret;
5526}
d1b19dff 5527EXPORT_SYMBOL(dev_change_flags);
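
/* A minimal usage sketch (hypothetical in-kernel caller, not from dev.c):
 * bring an interface administratively up, preserving the other
 * userspace-visible flags reported by dev_get_flags(). Requires RTNL.
 */
static int bring_dev_up(struct net_device *dev)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	err = dev_change_flags(dev, flags | IFF_UP);
	rtnl_unlock();
	return err;
}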
1da177e4 5528
2315dc91
VF
5529static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5530{
5531 const struct net_device_ops *ops = dev->netdev_ops;
5532
5533 if (ops->ndo_change_mtu)
5534 return ops->ndo_change_mtu(dev, new_mtu);
5535
5536 dev->mtu = new_mtu;
5537 return 0;
5538}
5539
f0db275a
SH
5540/**
5541 * dev_set_mtu - Change maximum transfer unit
5542 * @dev: device
5543 * @new_mtu: new transfer unit
5544 *
5545 * Change the maximum transfer size of the network device.
5546 */
1da177e4
LT
5547int dev_set_mtu(struct net_device *dev, int new_mtu)
5548{
2315dc91 5549 int err, orig_mtu;
1da177e4
LT
5550
5551 if (new_mtu == dev->mtu)
5552 return 0;
5553
5554 /* MTU must be positive. */
5555 if (new_mtu < 0)
5556 return -EINVAL;
5557
5558 if (!netif_device_present(dev))
5559 return -ENODEV;
5560
1d486bfb
VF
5561 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5562 err = notifier_to_errno(err);
5563 if (err)
5564 return err;
d314774c 5565
2315dc91
VF
5566 orig_mtu = dev->mtu;
5567 err = __dev_set_mtu(dev, new_mtu);
d314774c 5568
2315dc91
VF
5569 if (!err) {
5570 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5571 err = notifier_to_errno(err);
5572 if (err) {
5573 /* setting mtu back and notifying everyone again,
5574 * so that they have a chance to revert changes.
5575 */
5576 __dev_set_mtu(dev, orig_mtu);
5577 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5578 }
5579 }
1da177e4
LT
5580 return err;
5581}
d1b19dff 5582EXPORT_SYMBOL(dev_set_mtu);
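
/* A minimal usage sketch (hypothetical caller, not from dev.c): an
 * encapsulating driver reserving header room goes through dev_set_mtu()
 * so the NETDEV_PRECHANGEMTU/NETDEV_CHANGEMTU notifiers can veto or
 * revert the change. Requires RTNL.
 */
static int shrink_mtu_for_headroom(struct net_device *dev, unsigned int headroom)
{
	int err;

	ASSERT_RTNL();
	if (headroom >= dev->mtu)
		return -EINVAL;
	err = dev_set_mtu(dev, dev->mtu - headroom);
	if (err)
		netdev_warn(dev, "failed to adjust MTU: %d\n", err);
	return err;
}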
1da177e4 5583
cbda10fa
VD
5584/**
5585 * dev_set_group - Change group this device belongs to
5586 * @dev: device
5587 * @new_group: group this device should belong to
5588 */
5589void dev_set_group(struct net_device *dev, int new_group)
5590{
5591 dev->group = new_group;
5592}
5593EXPORT_SYMBOL(dev_set_group);
5594
f0db275a
SH
5595/**
5596 * dev_set_mac_address - Change Media Access Control Address
5597 * @dev: device
5598 * @sa: new address
5599 *
5600 * Change the hardware (MAC) address of the device
5601 */
1da177e4
LT
5602int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5603{
d314774c 5604 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
5605 int err;
5606
d314774c 5607 if (!ops->ndo_set_mac_address)
1da177e4
LT
5608 return -EOPNOTSUPP;
5609 if (sa->sa_family != dev->type)
5610 return -EINVAL;
5611 if (!netif_device_present(dev))
5612 return -ENODEV;
d314774c 5613 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
5614 if (err)
5615 return err;
fbdeca2d 5616 dev->addr_assign_type = NET_ADDR_SET;
f6521516 5617 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 5618 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 5619 return 0;
1da177e4 5620}
d1b19dff 5621EXPORT_SYMBOL(dev_set_mac_address);
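
/* A minimal usage sketch (hypothetical caller, not from dev.c): set a new
 * hardware address on an Ethernet device. The sockaddr family must match
 * dev->type, and RTNL must be held so the NETDEV_CHANGEADDR notification
 * is serialized.
 */
static int set_ether_addr(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;

	ASSERT_RTNL();
	sa.sa_family = dev->type;			/* ARPHRD_ETHER for Ethernet */
	memcpy(sa.sa_data, mac, dev->addr_len);		/* addr_len is 6 for Ethernet */
	return dev_set_mac_address(dev, &sa);
}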
1da177e4 5622
4bf84c35
JP
5623/**
5624 * dev_change_carrier - Change device carrier
5625 * @dev: device
691b3b7e 5626 * @new_carrier: new value
4bf84c35
JP
5627 *
5628 * Change device carrier
5629 */
5630int dev_change_carrier(struct net_device *dev, bool new_carrier)
5631{
5632 const struct net_device_ops *ops = dev->netdev_ops;
5633
5634 if (!ops->ndo_change_carrier)
5635 return -EOPNOTSUPP;
5636 if (!netif_device_present(dev))
5637 return -ENODEV;
5638 return ops->ndo_change_carrier(dev, new_carrier);
5639}
5640EXPORT_SYMBOL(dev_change_carrier);
5641
66b52b0d
JP
5642/**
5643 * dev_get_phys_port_id - Get device physical port ID
5644 * @dev: device
5645 * @ppid: port ID
5646 *
5647 * Get device physical port ID
5648 */
5649int dev_get_phys_port_id(struct net_device *dev,
5650 struct netdev_phys_port_id *ppid)
5651{
5652 const struct net_device_ops *ops = dev->netdev_ops;
5653
5654 if (!ops->ndo_get_phys_port_id)
5655 return -EOPNOTSUPP;
5656 return ops->ndo_get_phys_port_id(dev, ppid);
5657}
5658EXPORT_SYMBOL(dev_get_phys_port_id);
5659
1da177e4
LT
5660/**
5661 * dev_new_index - allocate an ifindex
c4ea43c5 5662 * @net: the applicable net namespace
1da177e4
LT
5663 *
5664 * Returns a suitable unique value for a new device interface
5665 * number. The caller must hold the rtnl semaphore or the
5666 * dev_base_lock to be sure it remains unique.
5667 */
881d966b 5668static int dev_new_index(struct net *net)
1da177e4 5669{
aa79e66e 5670 int ifindex = net->ifindex;
1da177e4
LT
5671 for (;;) {
5672 if (++ifindex <= 0)
5673 ifindex = 1;
881d966b 5674 if (!__dev_get_by_index(net, ifindex))
aa79e66e 5675 return net->ifindex = ifindex;
1da177e4
LT
5676 }
5677}
5678
1da177e4 5679/* Delayed registration/unregisteration */
3b5b34fd 5680static LIST_HEAD(net_todo_list);
200b916f 5681DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
1da177e4 5682
6f05f629 5683static void net_set_todo(struct net_device *dev)
1da177e4 5684{
1da177e4 5685 list_add_tail(&dev->todo_list, &net_todo_list);
50624c93 5686 dev_net(dev)->dev_unreg_count++;
1da177e4
LT
5687}
5688
9b5e383c 5689static void rollback_registered_many(struct list_head *head)
93ee31f1 5690{
e93737b0 5691 struct net_device *dev, *tmp;
5cde2829 5692 LIST_HEAD(close_head);
9b5e383c 5693
93ee31f1
DL
5694 BUG_ON(dev_boot_phase);
5695 ASSERT_RTNL();
5696
e93737b0 5697 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5698 /* Some devices call without registering
e93737b0
KK
5699 * for initialization unwind. Remove those
5700 * devices and proceed with the remaining.
9b5e383c
ED
5701 */
5702 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
5703 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5704 dev->name, dev);
93ee31f1 5705
9b5e383c 5706 WARN_ON(1);
e93737b0
KK
5707 list_del(&dev->unreg_list);
5708 continue;
9b5e383c 5709 }
449f4544 5710 dev->dismantle = true;
9b5e383c 5711 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5712 }
93ee31f1 5713
44345724 5714 /* If device is running, close it first. */
5cde2829
EB
5715 list_for_each_entry(dev, head, unreg_list)
5716 list_add_tail(&dev->close_list, &close_head);
5717 dev_close_many(&close_head);
93ee31f1 5718
44345724 5719 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5720 /* And unlink it from device chain. */
5721 unlist_netdevice(dev);
93ee31f1 5722
9b5e383c
ED
5723 dev->reg_state = NETREG_UNREGISTERING;
5724 }
93ee31f1
DL
5725
5726 synchronize_net();
5727
9b5e383c
ED
5728 list_for_each_entry(dev, head, unreg_list) {
5729 /* Shutdown queueing discipline. */
5730 dev_shutdown(dev);
93ee31f1
DL
5731
5732
9b5e383c
ED
5733 /* Notify protocols that we are about to destroy
5734 this device. They should clean up all of their state.
5735 */
5736 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5737
9b5e383c
ED
5738 /*
5739 * Flush the unicast and multicast chains
5740 */
a748ee24 5741 dev_uc_flush(dev);
22bedad3 5742 dev_mc_flush(dev);
93ee31f1 5743
9b5e383c
ED
5744 if (dev->netdev_ops->ndo_uninit)
5745 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5746
56bfa7ee
RP
5747 if (!dev->rtnl_link_ops ||
5748 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5749 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5750
9ff162a8
JP
5751 /* Notifier chain MUST detach us all upper devices. */
5752 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 5753
9b5e383c
ED
5754 /* Remove entries from kobject tree */
5755 netdev_unregister_kobject(dev);
024e9679
AD
5756#ifdef CONFIG_XPS
5757 /* Remove XPS queueing entries */
5758 netif_reset_xps_queues_gt(dev, 0);
5759#endif
9b5e383c 5760 }
93ee31f1 5761
850a545b 5762 synchronize_net();
395264d5 5763
a5ee1551 5764 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5765 dev_put(dev);
5766}
5767
5768static void rollback_registered(struct net_device *dev)
5769{
5770 LIST_HEAD(single);
5771
5772 list_add(&dev->unreg_list, &single);
5773 rollback_registered_many(&single);
ceaaec98 5774 list_del(&single);
93ee31f1
DL
5775}
5776
c8f44aff
MM
5777static netdev_features_t netdev_fix_features(struct net_device *dev,
5778 netdev_features_t features)
b63365a2 5779{
57422dc5
MM
5780 /* Fix illegal checksum combinations */
5781 if ((features & NETIF_F_HW_CSUM) &&
5782 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5783 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5784 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5785 }
5786
b63365a2 5787 /* TSO requires that SG is present as well. */
ea2d3688 5788 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5789 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5790 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5791 }
5792
ec5f0615
PS
5793 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5794 !(features & NETIF_F_IP_CSUM)) {
5795 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5796 features &= ~NETIF_F_TSO;
5797 features &= ~NETIF_F_TSO_ECN;
5798 }
5799
5800 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5801 !(features & NETIF_F_IPV6_CSUM)) {
5802 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5803 features &= ~NETIF_F_TSO6;
5804 }
5805
31d8b9e0
BH
5806 /* TSO ECN requires that TSO is present as well. */
5807 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5808 features &= ~NETIF_F_TSO_ECN;
5809
212b573f
MM
5810 /* Software GSO depends on SG. */
5811 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5812 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5813 features &= ~NETIF_F_GSO;
5814 }
5815
acd1130e 5816 /* UFO needs SG and checksumming */
b63365a2 5817 if (features & NETIF_F_UFO) {
79032644
MM
5818 /* maybe split UFO into V4 and V6? */
5819 if (!((features & NETIF_F_GEN_CSUM) ||
5820 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5821 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5822 netdev_dbg(dev,
acd1130e 5823 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5824 features &= ~NETIF_F_UFO;
5825 }
5826
5827 if (!(features & NETIF_F_SG)) {
6f404e44 5828 netdev_dbg(dev,
acd1130e 5829 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5830 features &= ~NETIF_F_UFO;
5831 }
5832 }
5833
d0290214
JP
5834#ifdef CONFIG_NET_RX_BUSY_POLL
5835 if (dev->netdev_ops->ndo_busy_poll)
5836 features |= NETIF_F_BUSY_POLL;
5837 else
5838#endif
5839 features &= ~NETIF_F_BUSY_POLL;
5840
b63365a2
HX
5841 return features;
5842}
b63365a2 5843
6cb6a27c 5844int __netdev_update_features(struct net_device *dev)
5455c699 5845{
c8f44aff 5846 netdev_features_t features;
5455c699
MM
5847 int err = 0;
5848
87267485
MM
5849 ASSERT_RTNL();
5850
5455c699
MM
5851 features = netdev_get_wanted_features(dev);
5852
5853 if (dev->netdev_ops->ndo_fix_features)
5854 features = dev->netdev_ops->ndo_fix_features(dev, features);
5855
5856 /* driver might be less strict about feature dependencies */
5857 features = netdev_fix_features(dev, features);
5858
5859 if (dev->features == features)
6cb6a27c 5860 return 0;
5455c699 5861
c8f44aff
MM
5862 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5863 &dev->features, &features);
5455c699
MM
5864
5865 if (dev->netdev_ops->ndo_set_features)
5866 err = dev->netdev_ops->ndo_set_features(dev, features);
5867
6cb6a27c 5868 if (unlikely(err < 0)) {
5455c699 5869 netdev_err(dev,
c8f44aff
MM
5870 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5871 err, &features, &dev->features);
6cb6a27c
MM
5872 return -1;
5873 }
5874
5875 if (!err)
5876 dev->features = features;
5877
5878 return 1;
5879}
5880
afe12cc8
MM
5881/**
5882 * netdev_update_features - recalculate device features
5883 * @dev: the device to check
5884 *
5885 * Recalculate dev->features set and send notifications if it
5886 * has changed. Should be called after driver or hardware dependent
5887 * conditions might have changed that influence the features.
5888 */
6cb6a27c
MM
5889void netdev_update_features(struct net_device *dev)
5890{
5891 if (__netdev_update_features(dev))
5892 netdev_features_change(dev);
5455c699
MM
5893}
5894EXPORT_SYMBOL(netdev_update_features);
5895
afe12cc8
MM
5896/**
5897 * netdev_change_features - recalculate device features
5898 * @dev: the device to check
5899 *
5900 * Recalculate dev->features set and send notifications even
5901 * if they have not changed. Should be called instead of
5902 * netdev_update_features() if also dev->vlan_features might
5903 * have changed to allow the changes to be propagated to stacked
5904 * VLAN devices.
5905 */
5906void netdev_change_features(struct net_device *dev)
5907{
5908 __netdev_update_features(dev);
5909 netdev_features_change(dev);
5910}
5911EXPORT_SYMBOL(netdev_change_features);
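
/* A minimal usage sketch (hypothetical driver code, not from dev.c): after
 * hardware reports that TSO became unusable, the driver clears the
 * capability and lets the core recompute dev->features. Because
 * dev->vlan_features is touched as well, netdev_change_features() is used
 * so stacked VLAN devices are notified unconditionally.
 */
static void mydrv_disable_tso(struct net_device *dev)
{
	rtnl_lock();
	dev->hw_features &= ~NETIF_F_ALL_TSO;
	dev->vlan_features &= ~NETIF_F_ALL_TSO;
	netdev_change_features(dev);		/* recompute + always notify */
	rtnl_unlock();
}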
5912
fc4a7489
PM
5913/**
5914 * netif_stacked_transfer_operstate - transfer operstate
5915 * @rootdev: the root or lower level device to transfer state from
5916 * @dev: the device to transfer operstate to
5917 *
5918 * Transfer operational state from root to device. This is normally
5919 * called when a stacking relationship exists between the root
5920 * device and the device (a leaf device).
5921 */
5922void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5923 struct net_device *dev)
5924{
5925 if (rootdev->operstate == IF_OPER_DORMANT)
5926 netif_dormant_on(dev);
5927 else
5928 netif_dormant_off(dev);
5929
5930 if (netif_carrier_ok(rootdev)) {
5931 if (!netif_carrier_ok(dev))
5932 netif_carrier_on(dev);
5933 } else {
5934 if (netif_carrier_ok(dev))
5935 netif_carrier_off(dev);
5936 }
5937}
5938EXPORT_SYMBOL(netif_stacked_transfer_operstate);
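
/* A minimal usage sketch (hypothetical stacking driver, not from dev.c):
 * when the lower device changes state, copy its carrier/dormant state onto
 * the upper device. Assumes RTNL (held in netdev notifiers) and that the
 * lower device's master is our device.
 */
static int stacked_device_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *lower = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper = netdev_master_upper_dev_get(lower);

	if (upper && (event == NETDEV_UP || event == NETDEV_CHANGE))
		netif_stacked_transfer_operstate(lower, upper);

	return NOTIFY_DONE;
}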
5939
a953be53 5940#ifdef CONFIG_SYSFS
1b4bf461
ED
5941static int netif_alloc_rx_queues(struct net_device *dev)
5942{
1b4bf461 5943 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5944 struct netdev_rx_queue *rx;
1b4bf461 5945
bd25fa7b 5946 BUG_ON(count < 1);
1b4bf461 5947
bd25fa7b 5948 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
62b5942a 5949 if (!rx)
bd25fa7b 5950 return -ENOMEM;
62b5942a 5951
bd25fa7b
TH
5952 dev->_rx = rx;
5953
bd25fa7b 5954 for (i = 0; i < count; i++)
fe822240 5955 rx[i].dev = dev;
1b4bf461
ED
5956 return 0;
5957}
bf264145 5958#endif
1b4bf461 5959
aa942104
CG
5960static void netdev_init_one_queue(struct net_device *dev,
5961 struct netdev_queue *queue, void *_unused)
5962{
5963 /* Initialize queue lock */
5964 spin_lock_init(&queue->_xmit_lock);
5965 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5966 queue->xmit_lock_owner = -1;
b236da69 5967 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 5968 queue->dev = dev;
114cf580
TH
5969#ifdef CONFIG_BQL
5970 dql_init(&queue->dql, HZ);
5971#endif
aa942104
CG
5972}
5973
60877a32
ED
5974static void netif_free_tx_queues(struct net_device *dev)
5975{
4cb28970 5976 kvfree(dev->_tx);
60877a32
ED
5977}
5978
e6484930
TH
5979static int netif_alloc_netdev_queues(struct net_device *dev)
5980{
5981 unsigned int count = dev->num_tx_queues;
5982 struct netdev_queue *tx;
60877a32 5983 size_t sz = count * sizeof(*tx);
e6484930 5984
60877a32 5985 BUG_ON(count < 1 || count > 0xffff);
62b5942a 5986
60877a32
ED
5987 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5988 if (!tx) {
5989 tx = vzalloc(sz);
5990 if (!tx)
5991 return -ENOMEM;
5992 }
e6484930 5993 dev->_tx = tx;
1d24eb48 5994
e6484930
TH
5995 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5996 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5997
5998 return 0;
e6484930
TH
5999}
6000
1da177e4
LT
6001/**
6002 * register_netdevice - register a network device
6003 * @dev: device to register
6004 *
6005 * Take a completed network device structure and add it to the kernel
6006 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6007 * chain. 0 is returned on success. A negative errno code is returned
6008 * on a failure to set up the device, or if the name is a duplicate.
6009 *
6010 * Callers must hold the rtnl semaphore. You may want
6011 * register_netdev() instead of this.
6012 *
6013 * BUGS:
6014 * The locking appears insufficient to guarantee two parallel registers
6015 * will not get the same name.
6016 */
6017
6018int register_netdevice(struct net_device *dev)
6019{
1da177e4 6020 int ret;
d314774c 6021 struct net *net = dev_net(dev);
1da177e4
LT
6022
6023 BUG_ON(dev_boot_phase);
6024 ASSERT_RTNL();
6025
b17a7c17
SH
6026 might_sleep();
6027
1da177e4
LT
6028 /* When net_device's are persistent, this will be fatal. */
6029 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 6030 BUG_ON(!net);
1da177e4 6031
f1f28aa3 6032 spin_lock_init(&dev->addr_list_lock);
cf508b12 6033 netdev_set_addr_lockdep_class(dev);
1da177e4 6034
1da177e4
LT
6035 dev->iflink = -1;
6036
828de4f6 6037 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
6038 if (ret < 0)
6039 goto out;
6040
1da177e4 6041 /* Init, if this function is available */
d314774c
SH
6042 if (dev->netdev_ops->ndo_init) {
6043 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
6044 if (ret) {
6045 if (ret > 0)
6046 ret = -EIO;
90833aa4 6047 goto out;
1da177e4
LT
6048 }
6049 }
4ec93edb 6050
f646968f
PM
6051 if (((dev->hw_features | dev->features) &
6052 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
6053 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6054 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6055 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6056 ret = -EINVAL;
6057 goto err_uninit;
6058 }
6059
9c7dafbf
PE
6060 ret = -EBUSY;
6061 if (!dev->ifindex)
6062 dev->ifindex = dev_new_index(net);
6063 else if (__dev_get_by_index(net, dev->ifindex))
6064 goto err_uninit;
6065
1da177e4
LT
6066 if (dev->iflink == -1)
6067 dev->iflink = dev->ifindex;
6068
5455c699
MM
6069 /* Transfer changeable features to wanted_features and enable
6070 * software offloads (GSO and GRO).
6071 */
6072 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
6073 dev->features |= NETIF_F_SOFT_FEATURES;
6074 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 6075
34324dc2
MM
6076 if (!(dev->flags & IFF_LOOPBACK)) {
6077 dev->hw_features |= NETIF_F_NOCACHE_COPY;
c6e1a0d1
TH
6078 }
6079
1180e7d6 6080 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 6081 */
1180e7d6 6082 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 6083
ee579677
PS
6084 /* Make NETIF_F_SG inheritable to tunnel devices.
6085 */
6086 dev->hw_enc_features |= NETIF_F_SG;
6087
0d89d203
SH
6088 /* Make NETIF_F_SG inheritable to MPLS.
6089 */
6090 dev->mpls_features |= NETIF_F_SG;
6091
7ffbe3fd
JB
6092 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6093 ret = notifier_to_errno(ret);
6094 if (ret)
6095 goto err_uninit;
6096
8b41d188 6097 ret = netdev_register_kobject(dev);
b17a7c17 6098 if (ret)
7ce1b0ed 6099 goto err_uninit;
b17a7c17
SH
6100 dev->reg_state = NETREG_REGISTERED;
6101
6cb6a27c 6102 __netdev_update_features(dev);
8e9b59b2 6103
1da177e4
LT
6104 /*
6105 * Default initial state at registration is that the
6106 * device is present.
6107 */
6108
6109 set_bit(__LINK_STATE_PRESENT, &dev->state);
6110
8f4cccbb
BH
6111 linkwatch_init_dev(dev);
6112
1da177e4 6113 dev_init_scheduler(dev);
1da177e4 6114 dev_hold(dev);
ce286d32 6115 list_netdevice(dev);
7bf23575 6116 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 6117
948b337e
JP
6118 /* If the device has permanent device address, driver should
6119 * set dev_addr and also addr_assign_type should be set to
6120 * NET_ADDR_PERM (default value).
6121 */
6122 if (dev->addr_assign_type == NET_ADDR_PERM)
6123 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6124
1da177e4 6125 /* Notify protocols, that a new device appeared. */
056925ab 6126 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 6127 ret = notifier_to_errno(ret);
93ee31f1
DL
6128 if (ret) {
6129 rollback_registered(dev);
6130 dev->reg_state = NETREG_UNREGISTERED;
6131 }
d90a909e
EB
6132 /*
6133 * Prevent userspace races by waiting until the network
6134 * device is fully setup before sending notifications.
6135 */
a2835763
PM
6136 if (!dev->rtnl_link_ops ||
6137 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7f294054 6138 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
1da177e4
LT
6139
6140out:
6141 return ret;
7ce1b0ed
HX
6142
6143err_uninit:
d314774c
SH
6144 if (dev->netdev_ops->ndo_uninit)
6145 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 6146 goto out;
1da177e4 6147}
d1b19dff 6148EXPORT_SYMBOL(register_netdevice);
1da177e4 6149
937f1ba5
BH
6150/**
6151 * init_dummy_netdev - init a dummy network device for NAPI
6152 * @dev: device to init
6153 *
6154 * This takes a network device structure and initializes the minimum
6155 * number of fields so it can be used to schedule NAPI polls without
6156 * registering a full blown interface. This is to be used by drivers
6157 * that need to tie several hardware interfaces to a single NAPI
6158 * poll scheduler due to HW limitations.
6159 */
6160int init_dummy_netdev(struct net_device *dev)
6161{
6162 /* Clear everything. Note we don't initialize spinlocks
6163 * as they aren't supposed to be taken by any of the
6164 * NAPI code and this dummy netdev is supposed to be
6165 * only ever used for NAPI polls
6166 */
6167 memset(dev, 0, sizeof(struct net_device));
6168
6169 /* make sure we BUG if trying to hit standard
6170 * register/unregister code path
6171 */
6172 dev->reg_state = NETREG_DUMMY;
6173
937f1ba5
BH
6174 /* NAPI wants this */
6175 INIT_LIST_HEAD(&dev->napi_list);
6176
6177 /* a dummy interface is started by default */
6178 set_bit(__LINK_STATE_PRESENT, &dev->state);
6179 set_bit(__LINK_STATE_START, &dev->state);
6180
29b4433d
ED
6181 /* Note : We don't allocate pcpu_refcnt for dummy devices,
6182 * because users of this 'device' don't need to change
6183 * its refcount.
6184 */
6185
937f1ba5
BH
6186 return 0;
6187}
6188EXPORT_SYMBOL_GPL(init_dummy_netdev);
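
/* A minimal usage sketch (hypothetical driver, not from dev.c): a
 * multi-port adapter with a single interrupt can hang its NAPI context off
 * a dummy netdev instead of a real registered interface. The adapter
 * struct and poll routine here are illustrative assumptions.
 */
struct mydrv_adapter {
	struct net_device napi_dev;	/* never registered, NAPI only */
	struct napi_struct napi;
};

static int mydrv_poll(struct napi_struct *napi, int budget)
{
	/* ... process up to @budget packets here ... */
	napi_complete(napi);
	return 0;
}

static void mydrv_setup_napi(struct mydrv_adapter *adap)
{
	init_dummy_netdev(&adap->napi_dev);
	netif_napi_add(&adap->napi_dev, &adap->napi, mydrv_poll,
		       NAPI_POLL_WEIGHT);
}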
6189
6190
1da177e4
LT
6191/**
6192 * register_netdev - register a network device
6193 * @dev: device to register
6194 *
6195 * Take a completed network device structure and add it to the kernel
6196 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6197 * chain. 0 is returned on success. A negative errno code is returned
6198 * on a failure to set up the device, or if the name is a duplicate.
6199 *
38b4da38 6200 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
6201 * and expands the device name if you passed a format string to
6202 * alloc_netdev.
6203 */
6204int register_netdev(struct net_device *dev)
6205{
6206 int err;
6207
6208 rtnl_lock();
1da177e4 6209 err = register_netdevice(dev);
1da177e4
LT
6210 rtnl_unlock();
6211 return err;
6212}
6213EXPORT_SYMBOL(register_netdev);
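
/* A minimal usage sketch (hypothetical probe path, not from dev.c):
 * allocate an Ethernet device, fill in the ops table, and register it.
 * register_netdev() takes the rtnl semaphore itself and expands a "%d"
 * name format. The private struct and ops table are illustrative stubs;
 * a real driver would populate ndo_open, ndo_start_xmit, etc.
 */
struct mydrv_priv { int dummy; };			/* hypothetical private data */
static const struct net_device_ops mydrv_netdev_ops;	/* hypothetical ops table */

static struct net_device *mydrv_create(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct mydrv_priv));
	if (!dev)
		return NULL;

	dev->netdev_ops = &mydrv_netdev_ops;

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);	/* safe: not yet registered */
		return NULL;
	}
	return dev;
}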
6214
29b4433d
ED
6215int netdev_refcnt_read(const struct net_device *dev)
6216{
6217 int i, refcnt = 0;
6218
6219 for_each_possible_cpu(i)
6220 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6221 return refcnt;
6222}
6223EXPORT_SYMBOL(netdev_refcnt_read);
6224
2c53040f 6225/**
1da177e4 6226 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 6227 * @dev: target net_device
1da177e4
LT
6228 *
6229 * This is called when unregistering network devices.
6230 *
6231 * Any protocol or device that holds a reference should register
6232 * for netdevice notification, and cleanup and put back the
6233 * reference if they receive an UNREGISTER event.
6234 * We can get stuck here if buggy protocols don't correctly
4ec93edb 6235 * call dev_put.
1da177e4
LT
6236 */
6237static void netdev_wait_allrefs(struct net_device *dev)
6238{
6239 unsigned long rebroadcast_time, warning_time;
29b4433d 6240 int refcnt;
1da177e4 6241
e014debe
ED
6242 linkwatch_forget_dev(dev);
6243
1da177e4 6244 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
6245 refcnt = netdev_refcnt_read(dev);
6246
6247 while (refcnt != 0) {
1da177e4 6248 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 6249 rtnl_lock();
1da177e4
LT
6250
6251 /* Rebroadcast unregister notification */
056925ab 6252 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 6253
748e2d93 6254 __rtnl_unlock();
0115e8e3 6255 rcu_barrier();
748e2d93
ED
6256 rtnl_lock();
6257
0115e8e3 6258 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
6259 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6260 &dev->state)) {
6261 /* We must not have linkwatch events
6262 * pending on unregister. If this
6263 * happens, we simply run the queue
6264 * unscheduled, resulting in a noop
6265 * for this device.
6266 */
6267 linkwatch_run_queue();
6268 }
6269
6756ae4b 6270 __rtnl_unlock();
1da177e4
LT
6271
6272 rebroadcast_time = jiffies;
6273 }
6274
6275 msleep(250);
6276
29b4433d
ED
6277 refcnt = netdev_refcnt_read(dev);
6278
1da177e4 6279 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
6280 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6281 dev->name, refcnt);
1da177e4
LT
6282 warning_time = jiffies;
6283 }
6284 }
6285}
6286
6287/* The sequence is:
6288 *
6289 * rtnl_lock();
6290 * ...
6291 * register_netdevice(x1);
6292 * register_netdevice(x2);
6293 * ...
6294 * unregister_netdevice(y1);
6295 * unregister_netdevice(y2);
6296 * ...
6297 * rtnl_unlock();
6298 * free_netdev(y1);
6299 * free_netdev(y2);
6300 *
58ec3b4d 6301 * We are invoked by rtnl_unlock().
1da177e4 6302 * This allows us to deal with problems:
b17a7c17 6303 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
6304 * without deadlocking with linkwatch via keventd.
6305 * 2) Since we run with the RTNL semaphore not held, we can sleep
6306 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
6307 *
6308 * We must not return until all unregister events added during
6309 * the interval the lock was held have been completed.
1da177e4 6310 */
1da177e4
LT
6311void netdev_run_todo(void)
6312{
626ab0e6 6313 struct list_head list;
1da177e4 6314
1da177e4 6315 /* Snapshot list, allow later requests */
626ab0e6 6316 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
6317
6318 __rtnl_unlock();
626ab0e6 6319
0115e8e3
ED
6320
6321 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
6322 if (!list_empty(&list))
6323 rcu_barrier();
6324
1da177e4
LT
6325 while (!list_empty(&list)) {
6326 struct net_device *dev
e5e26d75 6327 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
6328 list_del(&dev->todo_list);
6329
748e2d93 6330 rtnl_lock();
0115e8e3 6331 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 6332 __rtnl_unlock();
0115e8e3 6333
b17a7c17 6334 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 6335 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
6336 dev->name, dev->reg_state);
6337 dump_stack();
6338 continue;
6339 }
1da177e4 6340
b17a7c17 6341 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 6342
152102c7 6343 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 6344
b17a7c17 6345 netdev_wait_allrefs(dev);
1da177e4 6346
b17a7c17 6347 /* paranoia */
29b4433d 6348 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
6349 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6350 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 6351 WARN_ON(dev->dn_ptr);
1da177e4 6352
b17a7c17
SH
6353 if (dev->destructor)
6354 dev->destructor(dev);
9093bbb2 6355
50624c93
EB
6356 /* Report a network device has been unregistered */
6357 rtnl_lock();
6358 dev_net(dev)->dev_unreg_count--;
6359 __rtnl_unlock();
6360 wake_up(&netdev_unregistering_wq);
6361
9093bbb2
SH
6362 /* Free network device */
6363 kobject_put(&dev->dev.kobj);
1da177e4 6364 }
1da177e4
LT
6365}
6366
3cfde79c
BH
6367/* Convert net_device_stats to rtnl_link_stats64. They have the same
6368 * fields in the same order, with only the type differing.
6369 */
77a1abf5
ED
6370void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6371 const struct net_device_stats *netdev_stats)
3cfde79c
BH
6372{
6373#if BITS_PER_LONG == 64
77a1abf5
ED
6374 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6375 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
6376#else
6377 size_t i, n = sizeof(*stats64) / sizeof(u64);
6378 const unsigned long *src = (const unsigned long *)netdev_stats;
6379 u64 *dst = (u64 *)stats64;
6380
6381 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6382 sizeof(*stats64) / sizeof(u64));
6383 for (i = 0; i < n; i++)
6384 dst[i] = src[i];
6385#endif
6386}
77a1abf5 6387EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 6388
eeda3fd6
SH
6389/**
6390 * dev_get_stats - get network device statistics
6391 * @dev: device to get statistics from
28172739 6392 * @storage: place to store stats
eeda3fd6 6393 *
d7753516
BH
6394 * Get network statistics from device. Return @storage.
6395 * The device driver may provide its own method by setting
6396 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6397 * otherwise the internal statistics structure is used.
eeda3fd6 6398 */
d7753516
BH
6399struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6400 struct rtnl_link_stats64 *storage)
7004bf25 6401{
eeda3fd6
SH
6402 const struct net_device_ops *ops = dev->netdev_ops;
6403
28172739
ED
6404 if (ops->ndo_get_stats64) {
6405 memset(storage, 0, sizeof(*storage));
caf586e5
ED
6406 ops->ndo_get_stats64(dev, storage);
6407 } else if (ops->ndo_get_stats) {
3cfde79c 6408 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
6409 } else {
6410 netdev_stats_to_stats64(storage, &dev->stats);
28172739 6411 }
caf586e5 6412 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
015f0688 6413 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
28172739 6414 return storage;
c45d286e 6415}
eeda3fd6 6416EXPORT_SYMBOL(dev_get_stats);
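
/* A minimal usage sketch (hypothetical caller, not from dev.c): snapshot a
 * device's counters. dev_get_stats() fills and returns @storage, falling
 * back to dev->stats when the driver provides neither ndo_get_stats64 nor
 * ndo_get_stats.
 */
static u64 dev_total_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets + stats.tx_packets;
}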
c45d286e 6417
24824a09 6418struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 6419{
24824a09 6420 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 6421
24824a09
ED
6422#ifdef CONFIG_NET_CLS_ACT
6423 if (queue)
6424 return queue;
6425 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6426 if (!queue)
6427 return NULL;
6428 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
6429 queue->qdisc = &noop_qdisc;
6430 queue->qdisc_sleeping = &noop_qdisc;
6431 rcu_assign_pointer(dev->ingress_queue, queue);
6432#endif
6433 return queue;
bb949fbd
DM
6434}
6435
2c60db03
ED
6436static const struct ethtool_ops default_ethtool_ops;
6437
d07d7507
SG
6438void netdev_set_default_ethtool_ops(struct net_device *dev,
6439 const struct ethtool_ops *ops)
6440{
6441 if (dev->ethtool_ops == &default_ethtool_ops)
6442 dev->ethtool_ops = ops;
6443}
6444EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6445
74d332c1
ED
6446void netdev_freemem(struct net_device *dev)
6447{
6448 char *addr = (char *)dev - dev->padded;
6449
4cb28970 6450 kvfree(addr);
74d332c1
ED
6451}
6452
1da177e4 6453/**
36909ea4 6454 * alloc_netdev_mqs - allocate network device
c835a677
TG
6455 * @sizeof_priv: size of private data to allocate space for
6456 * @name: device name format string
6457 * @name_assign_type: origin of device name
6458 * @setup: callback to initialize device
6459 * @txqs: the number of TX subqueues to allocate
6460 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
6461 *
6462 * Allocates a struct net_device with private data area for driver use
90e51adf 6463 * and performs basic initialization. Also allocates subqueue structs
36909ea4 6464 * for each queue on the device.
1da177e4 6465 */
36909ea4 6466struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
c835a677 6467 unsigned char name_assign_type,
36909ea4
TH
6468 void (*setup)(struct net_device *),
6469 unsigned int txqs, unsigned int rxqs)
1da177e4 6470{
1da177e4 6471 struct net_device *dev;
7943986c 6472 size_t alloc_size;
1ce8e7b5 6473 struct net_device *p;
1da177e4 6474
b6fe17d6
SH
6475 BUG_ON(strlen(name) >= sizeof(dev->name));
6476
36909ea4 6477 if (txqs < 1) {
7b6cd1ce 6478 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
6479 return NULL;
6480 }
6481
a953be53 6482#ifdef CONFIG_SYSFS
36909ea4 6483 if (rxqs < 1) {
7b6cd1ce 6484 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
6485 return NULL;
6486 }
6487#endif
6488
fd2ea0a7 6489 alloc_size = sizeof(struct net_device);
d1643d24
AD
6490 if (sizeof_priv) {
6491 /* ensure 32-byte alignment of private area */
1ce8e7b5 6492 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
6493 alloc_size += sizeof_priv;
6494 }
6495 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 6496 alloc_size += NETDEV_ALIGN - 1;
1da177e4 6497
74d332c1
ED
6498 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6499 if (!p)
6500 p = vzalloc(alloc_size);
62b5942a 6501 if (!p)
1da177e4 6502 return NULL;
1da177e4 6503
1ce8e7b5 6504 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 6505 dev->padded = (char *)dev - (char *)p;
ab9c73cc 6506
29b4433d
ED
6507 dev->pcpu_refcnt = alloc_percpu(int);
6508 if (!dev->pcpu_refcnt)
74d332c1 6509 goto free_dev;
ab9c73cc 6510
ab9c73cc 6511 if (dev_addr_init(dev))
29b4433d 6512 goto free_pcpu;
ab9c73cc 6513
22bedad3 6514 dev_mc_init(dev);
a748ee24 6515 dev_uc_init(dev);
ccffad25 6516
c346dca1 6517 dev_net_set(dev, &init_net);
1da177e4 6518
8d3bdbd5 6519 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 6520 dev->gso_max_segs = GSO_MAX_SEGS;
8d3bdbd5 6521
8d3bdbd5
DM
6522 INIT_LIST_HEAD(&dev->napi_list);
6523 INIT_LIST_HEAD(&dev->unreg_list);
5cde2829 6524 INIT_LIST_HEAD(&dev->close_list);
8d3bdbd5 6525 INIT_LIST_HEAD(&dev->link_watch_list);
2f268f12
VF
6526 INIT_LIST_HEAD(&dev->adj_list.upper);
6527 INIT_LIST_HEAD(&dev->adj_list.lower);
6528 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6529 INIT_LIST_HEAD(&dev->all_adj_list.lower);
8d3bdbd5
DM
6530 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6531 setup(dev);
6532
36909ea4
TH
6533 dev->num_tx_queues = txqs;
6534 dev->real_num_tx_queues = txqs;
ed9af2e8 6535 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6536 goto free_all;
e8a0464c 6537
a953be53 6538#ifdef CONFIG_SYSFS
36909ea4
TH
6539 dev->num_rx_queues = rxqs;
6540 dev->real_num_rx_queues = rxqs;
fe822240 6541 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6542 goto free_all;
df334545 6543#endif
0a9627f2 6544
1da177e4 6545 strcpy(dev->name, name);
c835a677 6546 dev->name_assign_type = name_assign_type;
cbda10fa 6547 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
6548 if (!dev->ethtool_ops)
6549 dev->ethtool_ops = &default_ethtool_ops;
1da177e4 6550 return dev;
ab9c73cc 6551
8d3bdbd5
DM
6552free_all:
6553 free_netdev(dev);
6554 return NULL;
6555
29b4433d
ED
6556free_pcpu:
6557 free_percpu(dev->pcpu_refcnt);
74d332c1
ED
6558free_dev:
6559 netdev_freemem(dev);
ab9c73cc 6560 return NULL;
1da177e4 6561}
36909ea4 6562EXPORT_SYMBOL(alloc_netdev_mqs);
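
/* A minimal usage sketch (hypothetical driver, not from dev.c): allocate a
 * device with 4 TX and 4 RX queues and no private area. ether_setup() is
 * the usual setup callback for Ethernet-like devices; NET_NAME_UNKNOWN
 * records that the name origin was not specified.
 */
static struct net_device *mydrv_alloc(void)
{
	return alloc_netdev_mqs(0, "myeth%d", NET_NAME_UNKNOWN,
				ether_setup, 4, 4);
}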
1da177e4
LT
6563
6564/**
6565 * free_netdev - free network device
6566 * @dev: device
6567 *
4ec93edb
YH
6568 * This function does the last stage of destroying an allocated device
6569 * interface. The reference to the device object is released.
1da177e4
LT
6570 * If this is the last reference then it will be freed.
6571 */
6572void free_netdev(struct net_device *dev)
6573{
d565b0a1
HX
6574 struct napi_struct *p, *n;
6575
f3005d7f
DL
6576 release_net(dev_net(dev));
6577
60877a32 6578 netif_free_tx_queues(dev);
a953be53 6579#ifdef CONFIG_SYSFS
fe822240
TH
6580 kfree(dev->_rx);
6581#endif
e8a0464c 6582
33d480ce 6583 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6584
f001fde5
JP
6585 /* Flush device addresses */
6586 dev_addr_flush(dev);
6587
d565b0a1
HX
6588 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6589 netif_napi_del(p);
6590
29b4433d
ED
6591 free_percpu(dev->pcpu_refcnt);
6592 dev->pcpu_refcnt = NULL;
6593
3041a069 6594 /* Compatibility with error handling in drivers */
1da177e4 6595 if (dev->reg_state == NETREG_UNINITIALIZED) {
74d332c1 6596 netdev_freemem(dev);
1da177e4
LT
6597 return;
6598 }
6599
6600 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6601 dev->reg_state = NETREG_RELEASED;
6602
43cb76d9
GKH
6603 /* will free via device release */
6604 put_device(&dev->dev);
1da177e4 6605}
d1b19dff 6606EXPORT_SYMBOL(free_netdev);
4ec93edb 6607
f0db275a
SH
6608/**
6609 * synchronize_net - Synchronize with packet receive processing
6610 *
6611 * Wait for packets currently being received to be done.
6612 * Does not block later packets from starting.
6613 */
4ec93edb 6614void synchronize_net(void)
1da177e4
LT
6615{
6616 might_sleep();
be3fc413
ED
6617 if (rtnl_is_locked())
6618 synchronize_rcu_expedited();
6619 else
6620 synchronize_rcu();
1da177e4 6621}
d1b19dff 6622EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6623
6624/**
44a0873d 6625 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6626 * @dev: device
44a0873d 6627 * @head: list
6ebfbc06 6628 *
1da177e4 6629 * This function shuts down a device interface and removes it
d59b54b1 6630 * from the kernel tables.
44a0873d 6631 * If @head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
6632 *
6633 * Callers must hold the rtnl semaphore. You may want
6634 * unregister_netdev() instead of this.
6635 */
6636
44a0873d 6637void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6638{
a6620712
HX
6639 ASSERT_RTNL();
6640
44a0873d 6641 if (head) {
9fdce099 6642 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6643 } else {
6644 rollback_registered(dev);
6645 /* Finish processing unregister after unlock */
6646 net_set_todo(dev);
6647 }
1da177e4 6648}
44a0873d 6649EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6650
9b5e383c
ED
6651/**
6652 * unregister_netdevice_many - unregister many devices
6653 * @head: list of devices
87757a91
ED
6654 *
6655 * Note: As most callers use a stack-allocated list_head,
6656 * we force a list_del() to make sure the stack won't be corrupted later.
9b5e383c
ED
6657 */
6658void unregister_netdevice_many(struct list_head *head)
6659{
6660 struct net_device *dev;
6661
6662 if (!list_empty(head)) {
6663 rollback_registered_many(head);
6664 list_for_each_entry(dev, head, unreg_list)
6665 net_set_todo(dev);
87757a91 6666 list_del(head);
9b5e383c
ED
6667 }
6668}
63c8099d 6669EXPORT_SYMBOL(unregister_netdevice_many);
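
/* A minimal usage sketch (hypothetical caller, not from dev.c): batch two
 * unregistrations into one RCU grace period. Devices are queued via
 * unregister_netdevice_queue() onto a stack list and then torn down
 * together; unregister_netdevice_many() also list_del()s the head.
 */
static void mydrv_destroy_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	rtnl_lock();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}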
9b5e383c 6670
1da177e4
LT
6671/**
6672 * unregister_netdev - remove device from the kernel
6673 * @dev: device
6674 *
6675 * This function shuts down a device interface and removes it
d59b54b1 6676 * from the kernel tables.
1da177e4
LT
6677 *
6678 * This is just a wrapper for unregister_netdevice that takes
6679 * the rtnl semaphore. In general you want to use this and not
6680 * unregister_netdevice.
6681 */
6682void unregister_netdev(struct net_device *dev)
6683{
6684 rtnl_lock();
6685 unregister_netdevice(dev);
6686 rtnl_unlock();
6687}
1da177e4
LT
6688EXPORT_SYMBOL(unregister_netdev);
6689
ce286d32
EB
6690/**
6691 * dev_change_net_namespace - move device to a different network namespace
6692 * @dev: device
6693 * @net: network namespace
6694 * @pat: If not NULL name pattern to try if the current device name
6695 * is already taken in the destination network namespace.
6696 *
6697 * This function shuts down a device interface and moves it
6698 * to a new network namespace. On success 0 is returned, on
6699 * a failure a negative errno code is returned.
6700 *
6701 * Callers must hold the rtnl semaphore.
6702 */
6703
6704int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6705{
ce286d32
EB
6706 int err;
6707
6708 ASSERT_RTNL();
6709
6710 /* Don't allow namespace local devices to be moved. */
6711 err = -EINVAL;
6712 if (dev->features & NETIF_F_NETNS_LOCAL)
6713 goto out;
6714
6715 /* Ensure the device has been registered */
ce286d32
EB
6716 if (dev->reg_state != NETREG_REGISTERED)
6717 goto out;
6718
6719 /* Get out if there is nothing to do */
6720 err = 0;
878628fb 6721 if (net_eq(dev_net(dev), net))
ce286d32
EB
6722 goto out;
6723
6724 /* Pick the destination device name, and ensure
6725 * we can use it in the destination network namespace.
6726 */
6727 err = -EEXIST;
d9031024 6728 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6729 /* We get here if we can't use the current device name */
6730 if (!pat)
6731 goto out;
828de4f6 6732 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
6733 goto out;
6734 }
6735
6736 /*
6737 * And now a mini version of register_netdevice unregister_netdevice.
6738 */
6739
6740 /* If device is running close it first. */
9b772652 6741 dev_close(dev);
ce286d32
EB
6742
6743 /* And unlink it from device chain */
6744 err = -ENODEV;
6745 unlist_netdevice(dev);
6746
6747 synchronize_net();
6748
6749 /* Shutdown queueing discipline. */
6750 dev_shutdown(dev);
6751
6752 /* Notify protocols that we are about to destroy
6753 this device. They should clean up all of their state.
3b27e105
DL
6754
6755 Note that dev->reg_state stays at NETREG_REGISTERED.
6756 This is wanted because this way 8021q and macvlan know
6757 the device is just moving and can keep their slaves up.
ce286d32
EB
6758 */
6759 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
6760 rcu_barrier();
6761 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7f294054 6762 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
ce286d32
EB
6763
6764 /*
6765 * Flush the unicast and multicast chains
6766 */
a748ee24 6767 dev_uc_flush(dev);
22bedad3 6768 dev_mc_flush(dev);
ce286d32 6769
4e66ae2e
SH
6770 /* Send a netdev-removed uevent to the old namespace */
6771 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6772
ce286d32 6773 /* Actually switch the network namespace */
c346dca1 6774 dev_net_set(dev, net);
ce286d32 6775
ce286d32
EB
6776 /* If there is an ifindex conflict assign a new one */
6777 if (__dev_get_by_index(net, dev->ifindex)) {
6778 int iflink = (dev->iflink == dev->ifindex);
6779 dev->ifindex = dev_new_index(net);
6780 if (iflink)
6781 dev->iflink = dev->ifindex;
6782 }
6783
4e66ae2e
SH
6784 /* Send a netdev-add uevent to the new namespace */
6785 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6786
8b41d188 6787 /* Fixup kobjects */
a1b3f594 6788 err = device_rename(&dev->dev, dev->name);
8b41d188 6789 WARN_ON(err);
ce286d32
EB
6790
6791 /* Add the device back in the hashes */
6792 list_netdevice(dev);
6793
6794 /* Notify protocols, that a new device appeared. */
6795 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6796
d90a909e
EB
6797 /*
6798 * Prevent userspace races by waiting until the network
6799 * device is fully setup before sending notifications.
6800 */
7f294054 6801 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
d90a909e 6802
ce286d32
EB
6803 synchronize_net();
6804 err = 0;
6805out:
6806 return err;
6807}
463d0183 6808EXPORT_SYMBOL_GPL(dev_change_net_namespace);
ce286d32 6809
1da177e4
LT
6810static int dev_cpu_callback(struct notifier_block *nfb,
6811 unsigned long action,
6812 void *ocpu)
6813{
6814 struct sk_buff **list_skb;
1da177e4
LT
6815 struct sk_buff *skb;
6816 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6817 struct softnet_data *sd, *oldsd;
6818
8bb78442 6819 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6820 return NOTIFY_OK;
6821
6822 local_irq_disable();
6823 cpu = smp_processor_id();
6824 sd = &per_cpu(softnet_data, cpu);
6825 oldsd = &per_cpu(softnet_data, oldcpu);
6826
6827 /* Find end of our completion_queue. */
6828 list_skb = &sd->completion_queue;
6829 while (*list_skb)
6830 list_skb = &(*list_skb)->next;
6831 /* Append completion queue from offline CPU. */
6832 *list_skb = oldsd->completion_queue;
6833 oldsd->completion_queue = NULL;
6834
1da177e4 6835 /* Append output queue from offline CPU. */
a9cbd588
CG
6836 if (oldsd->output_queue) {
6837 *sd->output_queue_tailp = oldsd->output_queue;
6838 sd->output_queue_tailp = oldsd->output_queue_tailp;
6839 oldsd->output_queue = NULL;
6840 oldsd->output_queue_tailp = &oldsd->output_queue;
6841 }
264524d5
HC
6842 /* Append NAPI poll list from offline CPU. */
6843 if (!list_empty(&oldsd->poll_list)) {
6844 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6845 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6846 }
1da177e4
LT
6847
6848 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6849 local_irq_enable();
6850
6851 /* Process offline CPU's input_pkt_queue */
76cc8b13 6852 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
ae78dbfa 6853 netif_rx_internal(skb);
76cc8b13 6854 input_queue_head_incr(oldsd);
fec5e652 6855 }
76cc8b13 6856 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
ae78dbfa 6857 netif_rx_internal(skb);
76cc8b13
TH
6858 input_queue_head_incr(oldsd);
6859 }
1da177e4
LT
6860
6861 return NOTIFY_OK;
6862}
1da177e4
LT
6863
6864
7f353bf2 6865/**
b63365a2
HX
6866 * netdev_increment_features - increment feature set by one
6867 * @all: current feature set
6868 * @one: new feature set
6869 * @mask: mask feature set
7f353bf2
HX
6870 *
6871 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6872 * @one to the master device with current feature set @all. Will not
6873 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6874 */
c8f44aff
MM
6875netdev_features_t netdev_increment_features(netdev_features_t all,
6876 netdev_features_t one, netdev_features_t mask)
b63365a2 6877{
1742f183
MM
6878 if (mask & NETIF_F_GEN_CSUM)
6879 mask |= NETIF_F_ALL_CSUM;
6880 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6881
1742f183
MM
6882 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6883 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6884
1742f183
MM
6885 /* If one device supports hw checksumming, set for all. */
6886 if (all & NETIF_F_GEN_CSUM)
6887 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6888
6889 return all;
6890}
b63365a2 6891EXPORT_SYMBOL(netdev_increment_features);
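
/* A minimal usage sketch (hypothetical aggregation driver, not from
 * dev.c): fold each slave's feature set into the master's, never enabling
 * anything outside @mask. Seeding with NETIF_F_ALL_FOR_ALL keeps the
 * "for all" bits set only while every slave advertises them.
 */
static netdev_features_t master_compute_features(struct net_device *slaves[],
						 int n,
						 netdev_features_t mask)
{
	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features, mask);
	return features;
}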
7f353bf2 6892
430f03cd 6893static struct hlist_head * __net_init netdev_create_hash(void)
30d97d35
PE
6894{
6895 int i;
6896 struct hlist_head *hash;
6897
6898 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6899 if (hash != NULL)
6900 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6901 INIT_HLIST_HEAD(&hash[i]);
6902
6903 return hash;
6904}
6905
881d966b 6906/* Initialize per network namespace state */
4665079c 6907static int __net_init netdev_init(struct net *net)
881d966b 6908{
734b6541
RM
6909 if (net != &init_net)
6910 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6911
30d97d35
PE
6912 net->dev_name_head = netdev_create_hash();
6913 if (net->dev_name_head == NULL)
6914 goto err_name;
881d966b 6915
30d97d35
PE
6916 net->dev_index_head = netdev_create_hash();
6917 if (net->dev_index_head == NULL)
6918 goto err_idx;
881d966b
EB
6919
6920 return 0;
30d97d35
PE
6921
6922err_idx:
6923 kfree(net->dev_name_head);
6924err_name:
6925 return -ENOMEM;
881d966b
EB
6926}
6927
f0db275a
SH
6928/**
6929 * netdev_drivername - network driver for the device
6930 * @dev: network device
f0db275a
SH
6931 *
6932 * Determine network driver for device.
6933 */
3019de12 6934const char *netdev_drivername(const struct net_device *dev)
6579e57b 6935{
cf04a4c7
SH
6936 const struct device_driver *driver;
6937 const struct device *parent;
3019de12 6938 const char *empty = "";
6579e57b
AV
6939
6940 parent = dev->dev.parent;
6579e57b 6941 if (!parent)
3019de12 6942 return empty;
6579e57b
AV
6943
6944 driver = parent->driver;
6945 if (driver && driver->name)
3019de12
DM
6946 return driver->name;
6947 return empty;
6579e57b
AV
6948}
6949
static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent) {
		r = dev_printk_emit(level[1] - '0',
				    dev->dev.parent,
				    "%s %s %s%s: %pV",
				    dev_driver_string(dev->dev.parent),
				    dev_name(dev->dev.parent),
				    netdev_name(dev), netdev_reg_state(dev),
				    vaf);
	} else if (dev) {
		r = printk("%s%s%s: %pV", level, netdev_name(dev),
			   netdev_reg_state(dev), vaf);
	} else {
		r = printk("%s(NULL net_device): %pV", level, vaf);
	}

	return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);

	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	int r;							\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	r = __netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
								\
	return r;						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

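/*
 * Illustrative sketch (not part of the original file): a driver uses the
 * per-level helpers generated above like ordinary printk()s, with the
 * device and its driver prefixed automatically.  The helper and its "up"
 * flag are hypothetical.
 */
static void __maybe_unused example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
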
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

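/*
 * Illustrative sketch (not part of the original file): any subsystem that
 * needs per-namespace state follows the same pattern as netdev_net_ops
 * above -- provide .init/.exit callbacks in a pernet_operations and hand it
 * to register_pernet_subsys(), which then invokes them for every network
 * namespace that is created or destroyed.  All example_* names below are
 * hypothetical, and the __net_init/__net_initdata annotations are omitted
 * for brevity.
 */
static int example_net_init(struct net *net)
{
	/* a real subsystem would allocate its per-namespace state here */
	return 0;
}

static void example_net_exit(struct net *net)
{
	/* ... and release that state here */
}

static struct pernet_operations __maybe_unused example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};
/* ... and in that subsystem's init code: register_pernet_subsys(&example_net_ops); */
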
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (e.g. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

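/*
 * Illustrative sketch (not part of the original file): a driver whose
 * devices must never leave their namespace opts out of the migration in
 * default_device_exit() by setting NETIF_F_NETNS_LOCAL at setup time, the
 * way the loopback driver does.  The setup helper name is hypothetical.
 */
static void __maybe_unused example_netns_local_setup(struct net_device *dev)
{
	/* devices flagged this way are skipped by default_device_exit() */
	dev->features |= NETIF_F_NETNS_LOCAL;
}
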
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&netdev_unregistering_wq, &wait,
				TASK_UNINTERRUPTIBLE);
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();
		schedule();
	}
	finish_wait(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device is
	 * present in a network namespace, the loopback device must be
	 * present as well. Since we now dynamically allocate and free the
	 * loopback device, maintain this invariant by keeping the loopback
	 * device first on the list of network devices, so that it is the
	 * first device that appears and the last network device that
	 * disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);