rocker: forward packets to CPU when port is joined to openvswitch
[deliverable/linux.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
1da177e4 100#include <linux/stat.h>
1da177e4
LT
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
44540960 104#include <net/xfrm.h>
1da177e4
LT
105#include <linux/highmem.h>
106#include <linux/init.h>
1da177e4 107#include <linux/module.h>
1da177e4
LT
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
1da177e4 111#include <net/iw_handler.h>
1da177e4 112#include <asm/current.h>
5bdb9886 113#include <linux/audit.h>
db217334 114#include <linux/dmaengine.h>
f6a78bfc 115#include <linux/err.h>
c7fa9d18 116#include <linux/ctype.h>
723e98b7 117#include <linux/if_arp.h>
6de329e2 118#include <linux/if_vlan.h>
8f0f2223 119#include <linux/ip.h>
ad55dcaf 120#include <net/ip.h>
25cd9ba0 121#include <net/mpls.h>
8f0f2223
DM
122#include <linux/ipv6.h>
123#include <linux/in.h>
b6b2fed1
DM
124#include <linux/jhash.h>
125#include <linux/random.h>
9cbc1cb8 126#include <trace/events/napi.h>
cf66ba58 127#include <trace/events/net.h>
07dc22e7 128#include <trace/events/skb.h>
5acbbd42 129#include <linux/pci.h>
caeda9b9 130#include <linux/inetdevice.h>
c445477d 131#include <linux/cpu_rmap.h>
c5905afb 132#include <linux/static_key.h>
af12fa6e 133#include <linux/hashtable.h>
60877a32 134#include <linux/vmalloc.h>
529d0489 135#include <linux/if_macvlan.h>
e7fd2885 136#include <linux/errqueue.h>
3b47d303 137#include <linux/hrtimer.h>
e687ad60 138#include <linux/netfilter_ingress.h>
1da177e4 139
342709ef
PE
140#include "net-sysfs.h"
141
d565b0a1
HX
142/* Instead of increasing this, you should create a hash table. */
143#define MAX_GRO_SKBS 8
144
5d38a079
HX
145/* This should be increased if a protocol with a bigger head is added. */
146#define GRO_MAX_HEAD (MAX_HEADER + 128)
147
1da177e4 148static DEFINE_SPINLOCK(ptype_lock);
62532da9 149static DEFINE_SPINLOCK(offload_lock);
900ff8c6
CW
150struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
151struct list_head ptype_all __read_mostly; /* Taps */
62532da9 152static struct list_head offload_base __read_mostly;
1da177e4 153
ae78dbfa 154static int netif_rx_internal(struct sk_buff *skb);
54951194
LP
155static int call_netdevice_notifiers_info(unsigned long val,
156 struct net_device *dev,
157 struct netdev_notifier_info *info);
ae78dbfa 158
1da177e4 159/*
7562f876 160 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
161 * semaphore.
162 *
c6d14c84 163 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
164 *
165 * Writers must hold the rtnl semaphore while they loop through the
7562f876 166 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
167 * actual updates. This allows pure readers to access the list even
168 * while a writer is preparing to update it.
169 *
170 * To put it another way, dev_base_lock is held for writing only to
171 * protect against pure readers; the rtnl semaphore provides the
172 * protection against other writers.
173 *
174 * See, for example usages, register_netdevice() and
175 * unregister_netdevice(), which must be called with the rtnl
176 * semaphore held.
177 */
1da177e4 178DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
179EXPORT_SYMBOL(dev_base_lock);
180
af12fa6e
ET
181/* protects napi_hash addition/deletion and napi_gen_id */
182static DEFINE_SPINLOCK(napi_hash_lock);
183
184static unsigned int napi_gen_id;
185static DEFINE_HASHTABLE(napi_hash, 8);
186
18afa4b0 187static seqcount_t devnet_rename_seq;
c91f6df2 188
4e985ada
TG
189static inline void dev_base_seq_inc(struct net *net)
190{
191 while (++net->dev_base_seq == 0);
192}
193
881d966b 194static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 195{
95c96174
ED
196 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
197
08e9897d 198 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
199}
200
881d966b 201static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 202{
7c28bd0b 203 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
204}
205
e36fa2f7 206static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
207{
208#ifdef CONFIG_RPS
e36fa2f7 209 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
210#endif
211}
212
e36fa2f7 213static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
214{
215#ifdef CONFIG_RPS
e36fa2f7 216 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
217#endif
218}
219
ce286d32 220/* Device list insertion */
53759be9 221static void list_netdevice(struct net_device *dev)
ce286d32 222{
c346dca1 223 struct net *net = dev_net(dev);
ce286d32
EB
224
225 ASSERT_RTNL();
226
227 write_lock_bh(&dev_base_lock);
c6d14c84 228 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 229 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
230 hlist_add_head_rcu(&dev->index_hlist,
231 dev_index_hash(net, dev->ifindex));
ce286d32 232 write_unlock_bh(&dev_base_lock);
4e985ada
TG
233
234 dev_base_seq_inc(net);
ce286d32
EB
235}
236
fb699dfd
ED
237/* Device list removal
238 * caller must respect a RCU grace period before freeing/reusing dev
239 */
ce286d32
EB
240static void unlist_netdevice(struct net_device *dev)
241{
242 ASSERT_RTNL();
243
244 /* Unlink dev from the device chain */
245 write_lock_bh(&dev_base_lock);
c6d14c84 246 list_del_rcu(&dev->dev_list);
72c9528b 247 hlist_del_rcu(&dev->name_hlist);
fb699dfd 248 hlist_del_rcu(&dev->index_hlist);
ce286d32 249 write_unlock_bh(&dev_base_lock);
4e985ada
TG
250
251 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
252}
253
1da177e4
LT
254/*
255 * Our notifier list
256 */
257
f07d5b94 258static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
259
260/*
261 * Device drivers call our routines to queue packets here. We empty the
262 * queue in the local softnet handler.
263 */
bea3348e 264
9958da05 265DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 266EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 267
cf508b12 268#ifdef CONFIG_LOCKDEP
723e98b7 269/*
c773e847 270 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
271 * according to dev->type
272 */
273static const unsigned short netdev_lock_type[] =
274 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
275 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
276 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
277 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
278 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
279 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
280 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
281 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
282 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
283 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
284 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
285 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
286 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
287 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
288 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 289
36cbd3dc 290static const char *const netdev_lock_name[] =
723e98b7
JP
291 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
292 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
293 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
294 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
295 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
296 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
297 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
298 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
299 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
300 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
301 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
302 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
303 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
304 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
305 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
306
307static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 308static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
309
310static inline unsigned short netdev_lock_pos(unsigned short dev_type)
311{
312 int i;
313
314 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
315 if (netdev_lock_type[i] == dev_type)
316 return i;
317 /* the last key is used by default */
318 return ARRAY_SIZE(netdev_lock_type) - 1;
319}
320
cf508b12
DM
321static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
322 unsigned short dev_type)
723e98b7
JP
323{
324 int i;
325
326 i = netdev_lock_pos(dev_type);
327 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
328 netdev_lock_name[i]);
329}
cf508b12
DM
330
331static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
332{
333 int i;
334
335 i = netdev_lock_pos(dev->type);
336 lockdep_set_class_and_name(&dev->addr_list_lock,
337 &netdev_addr_lock_key[i],
338 netdev_lock_name[i]);
339}
723e98b7 340#else
cf508b12
DM
341static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
342 unsigned short dev_type)
343{
344}
345static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
346{
347}
348#endif
1da177e4
LT
349
350/*******************************************************************************
351
352 Protocol management and registration routines
353
354*******************************************************************************/
355
1da177e4
LT
356/*
357 * Add a protocol ID to the list. Now that the input handler is
358 * smarter we can dispense with all the messy stuff that used to be
359 * here.
360 *
361 * BEWARE!!! Protocol handlers, mangling input packets,
362 * MUST BE last in hash buckets and checking protocol handlers
363 * MUST start from promiscuous ptype_all chain in net_bh.
364 * It is true now, do not change it.
365 * Explanation follows: if protocol handler, mangling packet, will
366 * be the first on list, it is not able to sense, that packet
367 * is cloned and should be copied-on-write, so that it will
368 * change it and subsequent readers will get broken packet.
369 * --ANK (980803)
370 */
371
c07b68e8
ED
372static inline struct list_head *ptype_head(const struct packet_type *pt)
373{
374 if (pt->type == htons(ETH_P_ALL))
7866a621 375 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
c07b68e8 376 else
7866a621
SN
377 return pt->dev ? &pt->dev->ptype_specific :
378 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
c07b68e8
ED
379}
380
1da177e4
LT
381/**
382 * dev_add_pack - add packet handler
383 * @pt: packet type declaration
384 *
385 * Add a protocol handler to the networking stack. The passed &packet_type
386 * is linked into kernel lists and may not be freed until it has been
387 * removed from the kernel lists.
388 *
4ec93edb 389 * This call does not sleep therefore it can not
1da177e4
LT
390 * guarantee all CPU's that are in middle of receiving packets
391 * will see the new packet type (until the next received packet).
392 */
393
394void dev_add_pack(struct packet_type *pt)
395{
c07b68e8 396 struct list_head *head = ptype_head(pt);
1da177e4 397
c07b68e8
ED
398 spin_lock(&ptype_lock);
399 list_add_rcu(&pt->list, head);
400 spin_unlock(&ptype_lock);
1da177e4 401}
d1b19dff 402EXPORT_SYMBOL(dev_add_pack);
1da177e4 403
1da177e4
LT
404/**
405 * __dev_remove_pack - remove packet handler
406 * @pt: packet type declaration
407 *
408 * Remove a protocol handler that was previously added to the kernel
409 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
410 * from the kernel lists and can be freed or reused once this function
4ec93edb 411 * returns.
1da177e4
LT
412 *
413 * The packet type might still be in use by receivers
414 * and must not be freed until after all the CPU's have gone
415 * through a quiescent state.
416 */
417void __dev_remove_pack(struct packet_type *pt)
418{
c07b68e8 419 struct list_head *head = ptype_head(pt);
1da177e4
LT
420 struct packet_type *pt1;
421
c07b68e8 422 spin_lock(&ptype_lock);
1da177e4
LT
423
424 list_for_each_entry(pt1, head, list) {
425 if (pt == pt1) {
426 list_del_rcu(&pt->list);
427 goto out;
428 }
429 }
430
7b6cd1ce 431 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 432out:
c07b68e8 433 spin_unlock(&ptype_lock);
1da177e4 434}
d1b19dff
ED
435EXPORT_SYMBOL(__dev_remove_pack);
436
1da177e4
LT
437/**
438 * dev_remove_pack - remove packet handler
439 * @pt: packet type declaration
440 *
441 * Remove a protocol handler that was previously added to the kernel
442 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
443 * from the kernel lists and can be freed or reused once this function
444 * returns.
445 *
446 * This call sleeps to guarantee that no CPU is looking at the packet
447 * type after return.
448 */
449void dev_remove_pack(struct packet_type *pt)
450{
451 __dev_remove_pack(pt);
4ec93edb 452
1da177e4
LT
453 synchronize_net();
454}
d1b19dff 455EXPORT_SYMBOL(dev_remove_pack);
1da177e4 456
62532da9
VY
457
458/**
459 * dev_add_offload - register offload handlers
460 * @po: protocol offload declaration
461 *
462 * Add protocol offload handlers to the networking stack. The passed
463 * &proto_offload is linked into kernel lists and may not be freed until
464 * it has been removed from the kernel lists.
465 *
466 * This call does not sleep therefore it can not
467 * guarantee all CPU's that are in middle of receiving packets
468 * will see the new offload handlers (until the next received packet).
469 */
470void dev_add_offload(struct packet_offload *po)
471{
bdef7de4 472 struct packet_offload *elem;
62532da9
VY
473
474 spin_lock(&offload_lock);
bdef7de4
DM
475 list_for_each_entry(elem, &offload_base, list) {
476 if (po->priority < elem->priority)
477 break;
478 }
479 list_add_rcu(&po->list, elem->list.prev);
62532da9
VY
480 spin_unlock(&offload_lock);
481}
482EXPORT_SYMBOL(dev_add_offload);
483
484/**
485 * __dev_remove_offload - remove offload handler
486 * @po: packet offload declaration
487 *
488 * Remove a protocol offload handler that was previously added to the
489 * kernel offload handlers by dev_add_offload(). The passed &offload_type
490 * is removed from the kernel lists and can be freed or reused once this
491 * function returns.
492 *
493 * The packet type might still be in use by receivers
494 * and must not be freed until after all the CPU's have gone
495 * through a quiescent state.
496 */
1d143d9f 497static void __dev_remove_offload(struct packet_offload *po)
62532da9
VY
498{
499 struct list_head *head = &offload_base;
500 struct packet_offload *po1;
501
c53aa505 502 spin_lock(&offload_lock);
62532da9
VY
503
504 list_for_each_entry(po1, head, list) {
505 if (po == po1) {
506 list_del_rcu(&po->list);
507 goto out;
508 }
509 }
510
511 pr_warn("dev_remove_offload: %p not found\n", po);
512out:
c53aa505 513 spin_unlock(&offload_lock);
62532da9 514}
62532da9
VY
515
516/**
517 * dev_remove_offload - remove packet offload handler
518 * @po: packet offload declaration
519 *
520 * Remove a packet offload handler that was previously added to the kernel
521 * offload handlers by dev_add_offload(). The passed &offload_type is
522 * removed from the kernel lists and can be freed or reused once this
523 * function returns.
524 *
525 * This call sleeps to guarantee that no CPU is looking at the packet
526 * type after return.
527 */
528void dev_remove_offload(struct packet_offload *po)
529{
530 __dev_remove_offload(po);
531
532 synchronize_net();
533}
534EXPORT_SYMBOL(dev_remove_offload);
535
1da177e4
LT
536/******************************************************************************
537
538 Device Boot-time Settings Routines
539
540*******************************************************************************/
541
542/* Boot time configuration table */
543static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
544
545/**
546 * netdev_boot_setup_add - add new setup entry
547 * @name: name of the device
548 * @map: configured settings for the device
549 *
550 * Adds new setup entry to the dev_boot_setup list. The function
551 * returns 0 on error and 1 on success. This is a generic routine to
552 * all netdevices.
553 */
554static int netdev_boot_setup_add(char *name, struct ifmap *map)
555{
556 struct netdev_boot_setup *s;
557 int i;
558
559 s = dev_boot_setup;
560 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
561 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
562 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 563 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
564 memcpy(&s[i].map, map, sizeof(s[i].map));
565 break;
566 }
567 }
568
569 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
570}
571
572/**
573 * netdev_boot_setup_check - check boot time settings
574 * @dev: the netdevice
575 *
576 * Check boot time settings for the device.
577 * The found settings are set for the device to be used
578 * later in the device probing.
579 * Returns 0 if no settings found, 1 if they are.
580 */
581int netdev_boot_setup_check(struct net_device *dev)
582{
583 struct netdev_boot_setup *s = dev_boot_setup;
584 int i;
585
586 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
587 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 588 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
589 dev->irq = s[i].map.irq;
590 dev->base_addr = s[i].map.base_addr;
591 dev->mem_start = s[i].map.mem_start;
592 dev->mem_end = s[i].map.mem_end;
593 return 1;
594 }
595 }
596 return 0;
597}
d1b19dff 598EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
599
600
601/**
602 * netdev_boot_base - get address from boot time settings
603 * @prefix: prefix for network device
604 * @unit: id for network device
605 *
606 * Check boot time settings for the base address of device.
607 * The found settings are set for the device to be used
608 * later in the device probing.
609 * Returns 0 if no settings found.
610 */
611unsigned long netdev_boot_base(const char *prefix, int unit)
612{
613 const struct netdev_boot_setup *s = dev_boot_setup;
614 char name[IFNAMSIZ];
615 int i;
616
617 sprintf(name, "%s%d", prefix, unit);
618
619 /*
620 * If device already registered then return base of 1
621 * to indicate not to probe for this interface
622 */
881d966b 623 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
624 return 1;
625
626 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
627 if (!strcmp(name, s[i].name))
628 return s[i].map.base_addr;
629 return 0;
630}
631
632/*
633 * Saves at boot time configured settings for any netdevice.
634 */
635int __init netdev_boot_setup(char *str)
636{
637 int ints[5];
638 struct ifmap map;
639
640 str = get_options(str, ARRAY_SIZE(ints), ints);
641 if (!str || !*str)
642 return 0;
643
644 /* Save settings */
645 memset(&map, 0, sizeof(map));
646 if (ints[0] > 0)
647 map.irq = ints[1];
648 if (ints[0] > 1)
649 map.base_addr = ints[2];
650 if (ints[0] > 2)
651 map.mem_start = ints[3];
652 if (ints[0] > 3)
653 map.mem_end = ints[4];
654
655 /* Add new entry to the list */
656 return netdev_boot_setup_add(str, &map);
657}
658
659__setup("netdev=", netdev_boot_setup);
660
661/*******************************************************************************
662
663 Device Interface Subroutines
664
665*******************************************************************************/
666
a54acb3a
ND
667/**
668 * dev_get_iflink - get 'iflink' value of a interface
669 * @dev: targeted interface
670 *
671 * Indicates the ifindex the interface is linked to.
672 * Physical interfaces have the same 'ifindex' and 'iflink' values.
673 */
674
675int dev_get_iflink(const struct net_device *dev)
676{
677 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
678 return dev->netdev_ops->ndo_get_iflink(dev);
679
7a66bbc9 680 return dev->ifindex;
a54acb3a
ND
681}
682EXPORT_SYMBOL(dev_get_iflink);
683
1da177e4
LT
684/**
685 * __dev_get_by_name - find a device by its name
c4ea43c5 686 * @net: the applicable net namespace
1da177e4
LT
687 * @name: name to find
688 *
689 * Find an interface by name. Must be called under RTNL semaphore
690 * or @dev_base_lock. If the name is found a pointer to the device
691 * is returned. If the name is not found then %NULL is returned. The
692 * reference counters are not incremented so the caller must be
693 * careful with locks.
694 */
695
881d966b 696struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 697{
0bd8d536
ED
698 struct net_device *dev;
699 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 700
b67bfe0d 701 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
702 if (!strncmp(dev->name, name, IFNAMSIZ))
703 return dev;
0bd8d536 704
1da177e4
LT
705 return NULL;
706}
d1b19dff 707EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 708
72c9528b
ED
709/**
710 * dev_get_by_name_rcu - find a device by its name
711 * @net: the applicable net namespace
712 * @name: name to find
713 *
714 * Find an interface by name.
715 * If the name is found a pointer to the device is returned.
716 * If the name is not found then %NULL is returned.
717 * The reference counters are not incremented so the caller must be
718 * careful with locks. The caller must hold RCU lock.
719 */
720
721struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
722{
72c9528b
ED
723 struct net_device *dev;
724 struct hlist_head *head = dev_name_hash(net, name);
725
b67bfe0d 726 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
727 if (!strncmp(dev->name, name, IFNAMSIZ))
728 return dev;
729
730 return NULL;
731}
732EXPORT_SYMBOL(dev_get_by_name_rcu);
733
1da177e4
LT
734/**
735 * dev_get_by_name - find a device by its name
c4ea43c5 736 * @net: the applicable net namespace
1da177e4
LT
737 * @name: name to find
738 *
739 * Find an interface by name. This can be called from any
740 * context and does its own locking. The returned handle has
741 * the usage count incremented and the caller must use dev_put() to
742 * release it when it is no longer needed. %NULL is returned if no
743 * matching device is found.
744 */
745
881d966b 746struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
747{
748 struct net_device *dev;
749
72c9528b
ED
750 rcu_read_lock();
751 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
752 if (dev)
753 dev_hold(dev);
72c9528b 754 rcu_read_unlock();
1da177e4
LT
755 return dev;
756}
d1b19dff 757EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
758
759/**
760 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 761 * @net: the applicable net namespace
1da177e4
LT
762 * @ifindex: index of device
763 *
764 * Search for an interface by index. Returns %NULL if the device
765 * is not found or a pointer to the device. The device has not
766 * had its reference counter increased so the caller must be careful
767 * about locking. The caller must hold either the RTNL semaphore
768 * or @dev_base_lock.
769 */
770
881d966b 771struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 772{
0bd8d536
ED
773 struct net_device *dev;
774 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 775
b67bfe0d 776 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
777 if (dev->ifindex == ifindex)
778 return dev;
0bd8d536 779
1da177e4
LT
780 return NULL;
781}
d1b19dff 782EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 783
fb699dfd
ED
784/**
785 * dev_get_by_index_rcu - find a device by its ifindex
786 * @net: the applicable net namespace
787 * @ifindex: index of device
788 *
789 * Search for an interface by index. Returns %NULL if the device
790 * is not found or a pointer to the device. The device has not
791 * had its reference counter increased so the caller must be careful
792 * about locking. The caller must hold RCU lock.
793 */
794
795struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
796{
fb699dfd
ED
797 struct net_device *dev;
798 struct hlist_head *head = dev_index_hash(net, ifindex);
799
b67bfe0d 800 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
801 if (dev->ifindex == ifindex)
802 return dev;
803
804 return NULL;
805}
806EXPORT_SYMBOL(dev_get_by_index_rcu);
807
1da177e4
LT
808
809/**
810 * dev_get_by_index - find a device by its ifindex
c4ea43c5 811 * @net: the applicable net namespace
1da177e4
LT
812 * @ifindex: index of device
813 *
814 * Search for an interface by index. Returns NULL if the device
815 * is not found or a pointer to the device. The device returned has
816 * had a reference added and the pointer is safe until the user calls
817 * dev_put to indicate they have finished with it.
818 */
819
881d966b 820struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
821{
822 struct net_device *dev;
823
fb699dfd
ED
824 rcu_read_lock();
825 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
826 if (dev)
827 dev_hold(dev);
fb699dfd 828 rcu_read_unlock();
1da177e4
LT
829 return dev;
830}
d1b19dff 831EXPORT_SYMBOL(dev_get_by_index);
1da177e4 832
5dbe7c17
NS
833/**
834 * netdev_get_name - get a netdevice name, knowing its ifindex.
835 * @net: network namespace
836 * @name: a pointer to the buffer where the name will be stored.
837 * @ifindex: the ifindex of the interface to get the name from.
838 *
839 * The use of raw_seqcount_begin() and cond_resched() before
840 * retrying is required as we want to give the writers a chance
841 * to complete when CONFIG_PREEMPT is not set.
842 */
843int netdev_get_name(struct net *net, char *name, int ifindex)
844{
845 struct net_device *dev;
846 unsigned int seq;
847
848retry:
849 seq = raw_seqcount_begin(&devnet_rename_seq);
850 rcu_read_lock();
851 dev = dev_get_by_index_rcu(net, ifindex);
852 if (!dev) {
853 rcu_read_unlock();
854 return -ENODEV;
855 }
856
857 strcpy(name, dev->name);
858 rcu_read_unlock();
859 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
860 cond_resched();
861 goto retry;
862 }
863
864 return 0;
865}
866
1da177e4 867/**
941666c2 868 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 869 * @net: the applicable net namespace
1da177e4
LT
870 * @type: media type of device
871 * @ha: hardware address
872 *
873 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
874 * is not found or a pointer to the device.
875 * The caller must hold RCU or RTNL.
941666c2 876 * The returned device has not had its ref count increased
1da177e4
LT
877 * and the caller must therefore be careful about locking
878 *
1da177e4
LT
879 */
880
941666c2
ED
881struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
882 const char *ha)
1da177e4
LT
883{
884 struct net_device *dev;
885
941666c2 886 for_each_netdev_rcu(net, dev)
1da177e4
LT
887 if (dev->type == type &&
888 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
889 return dev;
890
891 return NULL;
1da177e4 892}
941666c2 893EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 894
881d966b 895struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
896{
897 struct net_device *dev;
898
4e9cac2b 899 ASSERT_RTNL();
881d966b 900 for_each_netdev(net, dev)
4e9cac2b 901 if (dev->type == type)
7562f876
PE
902 return dev;
903
904 return NULL;
4e9cac2b 905}
4e9cac2b
PM
906EXPORT_SYMBOL(__dev_getfirstbyhwtype);
907
881d966b 908struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 909{
99fe3c39 910 struct net_device *dev, *ret = NULL;
4e9cac2b 911
99fe3c39
ED
912 rcu_read_lock();
913 for_each_netdev_rcu(net, dev)
914 if (dev->type == type) {
915 dev_hold(dev);
916 ret = dev;
917 break;
918 }
919 rcu_read_unlock();
920 return ret;
1da177e4 921}
1da177e4
LT
922EXPORT_SYMBOL(dev_getfirstbyhwtype);
923
924/**
6c555490 925 * __dev_get_by_flags - find any device with given flags
c4ea43c5 926 * @net: the applicable net namespace
1da177e4
LT
927 * @if_flags: IFF_* values
928 * @mask: bitmask of bits in if_flags to check
929 *
930 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04 931 * is not found or a pointer to the device. Must be called inside
6c555490 932 * rtnl_lock(), and result refcount is unchanged.
1da177e4
LT
933 */
934
6c555490
WC
935struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
936 unsigned short mask)
1da177e4 937{
7562f876 938 struct net_device *dev, *ret;
1da177e4 939
6c555490
WC
940 ASSERT_RTNL();
941
7562f876 942 ret = NULL;
6c555490 943 for_each_netdev(net, dev) {
1da177e4 944 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 945 ret = dev;
1da177e4
LT
946 break;
947 }
948 }
7562f876 949 return ret;
1da177e4 950}
6c555490 951EXPORT_SYMBOL(__dev_get_by_flags);
1da177e4
LT
952
953/**
954 * dev_valid_name - check if name is okay for network device
955 * @name: name string
956 *
957 * Network device names need to be valid file names to
c7fa9d18
DM
958 * to allow sysfs to work. We also disallow any kind of
959 * whitespace.
1da177e4 960 */
95f050bf 961bool dev_valid_name(const char *name)
1da177e4 962{
c7fa9d18 963 if (*name == '\0')
95f050bf 964 return false;
b6fe17d6 965 if (strlen(name) >= IFNAMSIZ)
95f050bf 966 return false;
c7fa9d18 967 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 968 return false;
c7fa9d18
DM
969
970 while (*name) {
a4176a93 971 if (*name == '/' || *name == ':' || isspace(*name))
95f050bf 972 return false;
c7fa9d18
DM
973 name++;
974 }
95f050bf 975 return true;
1da177e4 976}
d1b19dff 977EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
978
979/**
b267b179
EB
980 * __dev_alloc_name - allocate a name for a device
981 * @net: network namespace to allocate the device name in
1da177e4 982 * @name: name format string
b267b179 983 * @buf: scratch buffer and result name string
1da177e4
LT
984 *
985 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
986 * id. It scans list of devices to build up a free map, then chooses
987 * the first empty slot. The caller must hold the dev_base or rtnl lock
988 * while allocating the name and adding the device in order to avoid
989 * duplicates.
990 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
991 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
992 */
993
b267b179 994static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
995{
996 int i = 0;
1da177e4
LT
997 const char *p;
998 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 999 unsigned long *inuse;
1da177e4
LT
1000 struct net_device *d;
1001
1002 p = strnchr(name, IFNAMSIZ-1, '%');
1003 if (p) {
1004 /*
1005 * Verify the string as this thing may have come from
1006 * the user. There must be either one "%d" and no other "%"
1007 * characters.
1008 */
1009 if (p[1] != 'd' || strchr(p + 2, '%'))
1010 return -EINVAL;
1011
1012 /* Use one page as a bit array of possible slots */
cfcabdcc 1013 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
1014 if (!inuse)
1015 return -ENOMEM;
1016
881d966b 1017 for_each_netdev(net, d) {
1da177e4
LT
1018 if (!sscanf(d->name, name, &i))
1019 continue;
1020 if (i < 0 || i >= max_netdevices)
1021 continue;
1022
1023 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 1024 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
1025 if (!strncmp(buf, d->name, IFNAMSIZ))
1026 set_bit(i, inuse);
1027 }
1028
1029 i = find_first_zero_bit(inuse, max_netdevices);
1030 free_page((unsigned long) inuse);
1031 }
1032
d9031024
OP
1033 if (buf != name)
1034 snprintf(buf, IFNAMSIZ, name, i);
b267b179 1035 if (!__dev_get_by_name(net, buf))
1da177e4 1036 return i;
1da177e4
LT
1037
1038 /* It is possible to run out of possible slots
1039 * when the name is long and there isn't enough space left
1040 * for the digits, or if all bits are used.
1041 */
1042 return -ENFILE;
1043}
1044
b267b179
EB
1045/**
1046 * dev_alloc_name - allocate a name for a device
1047 * @dev: device
1048 * @name: name format string
1049 *
1050 * Passed a format string - eg "lt%d" it will try and find a suitable
1051 * id. It scans list of devices to build up a free map, then chooses
1052 * the first empty slot. The caller must hold the dev_base or rtnl lock
1053 * while allocating the name and adding the device in order to avoid
1054 * duplicates.
1055 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1056 * Returns the number of the unit assigned or a negative errno code.
1057 */
1058
1059int dev_alloc_name(struct net_device *dev, const char *name)
1060{
1061 char buf[IFNAMSIZ];
1062 struct net *net;
1063 int ret;
1064
c346dca1
YH
1065 BUG_ON(!dev_net(dev));
1066 net = dev_net(dev);
b267b179
EB
1067 ret = __dev_alloc_name(net, name, buf);
1068 if (ret >= 0)
1069 strlcpy(dev->name, buf, IFNAMSIZ);
1070 return ret;
1071}
d1b19dff 1072EXPORT_SYMBOL(dev_alloc_name);
b267b179 1073
828de4f6
G
1074static int dev_alloc_name_ns(struct net *net,
1075 struct net_device *dev,
1076 const char *name)
d9031024 1077{
828de4f6
G
1078 char buf[IFNAMSIZ];
1079 int ret;
8ce6cebc 1080
828de4f6
G
1081 ret = __dev_alloc_name(net, name, buf);
1082 if (ret >= 0)
1083 strlcpy(dev->name, buf, IFNAMSIZ);
1084 return ret;
1085}
1086
1087static int dev_get_valid_name(struct net *net,
1088 struct net_device *dev,
1089 const char *name)
1090{
1091 BUG_ON(!net);
8ce6cebc 1092
d9031024
OP
1093 if (!dev_valid_name(name))
1094 return -EINVAL;
1095
1c5cae81 1096 if (strchr(name, '%'))
828de4f6 1097 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1098 else if (__dev_get_by_name(net, name))
1099 return -EEXIST;
8ce6cebc
DL
1100 else if (dev->name != name)
1101 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1102
1103 return 0;
1104}
1da177e4
LT
1105
1106/**
1107 * dev_change_name - change name of a device
1108 * @dev: device
1109 * @newname: name (or format string) must be at least IFNAMSIZ
1110 *
1111 * Change name of a device, can pass format strings "eth%d".
1112 * for wildcarding.
1113 */
cf04a4c7 1114int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1115{
238fa362 1116 unsigned char old_assign_type;
fcc5a03a 1117 char oldname[IFNAMSIZ];
1da177e4 1118 int err = 0;
fcc5a03a 1119 int ret;
881d966b 1120 struct net *net;
1da177e4
LT
1121
1122 ASSERT_RTNL();
c346dca1 1123 BUG_ON(!dev_net(dev));
1da177e4 1124
c346dca1 1125 net = dev_net(dev);
1da177e4
LT
1126 if (dev->flags & IFF_UP)
1127 return -EBUSY;
1128
30e6c9fa 1129 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1130
1131 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1132 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1133 return 0;
c91f6df2 1134 }
c8d90dca 1135
fcc5a03a
HX
1136 memcpy(oldname, dev->name, IFNAMSIZ);
1137
828de4f6 1138 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1139 if (err < 0) {
30e6c9fa 1140 write_seqcount_end(&devnet_rename_seq);
d9031024 1141 return err;
c91f6df2 1142 }
1da177e4 1143
6fe82a39
VF
1144 if (oldname[0] && !strchr(oldname, '%'))
1145 netdev_info(dev, "renamed from %s\n", oldname);
1146
238fa362
TG
1147 old_assign_type = dev->name_assign_type;
1148 dev->name_assign_type = NET_NAME_RENAMED;
1149
fcc5a03a 1150rollback:
a1b3f594
EB
1151 ret = device_rename(&dev->dev, dev->name);
1152 if (ret) {
1153 memcpy(dev->name, oldname, IFNAMSIZ);
238fa362 1154 dev->name_assign_type = old_assign_type;
30e6c9fa 1155 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1156 return ret;
dcc99773 1157 }
7f988eab 1158
30e6c9fa 1159 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1160
5bb025fa
VF
1161 netdev_adjacent_rename_links(dev, oldname);
1162
7f988eab 1163 write_lock_bh(&dev_base_lock);
372b2312 1164 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1165 write_unlock_bh(&dev_base_lock);
1166
1167 synchronize_rcu();
1168
1169 write_lock_bh(&dev_base_lock);
1170 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1171 write_unlock_bh(&dev_base_lock);
1172
056925ab 1173 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1174 ret = notifier_to_errno(ret);
1175
1176 if (ret) {
91e9c07b
ED
1177 /* err >= 0 after dev_alloc_name() or stores the first errno */
1178 if (err >= 0) {
fcc5a03a 1179 err = ret;
30e6c9fa 1180 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a 1181 memcpy(dev->name, oldname, IFNAMSIZ);
5bb025fa 1182 memcpy(oldname, newname, IFNAMSIZ);
238fa362
TG
1183 dev->name_assign_type = old_assign_type;
1184 old_assign_type = NET_NAME_RENAMED;
fcc5a03a 1185 goto rollback;
91e9c07b 1186 } else {
7b6cd1ce 1187 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1188 dev->name, ret);
fcc5a03a
HX
1189 }
1190 }
1da177e4
LT
1191
1192 return err;
1193}
1194
0b815a1a
SH
1195/**
1196 * dev_set_alias - change ifalias of a device
1197 * @dev: device
1198 * @alias: name up to IFALIASZ
f0db275a 1199 * @len: limit of bytes to copy from info
0b815a1a
SH
1200 *
1201 * Set ifalias for a device,
1202 */
1203int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1204{
7364e445
AK
1205 char *new_ifalias;
1206
0b815a1a
SH
1207 ASSERT_RTNL();
1208
1209 if (len >= IFALIASZ)
1210 return -EINVAL;
1211
96ca4a2c 1212 if (!len) {
388dfc2d
SK
1213 kfree(dev->ifalias);
1214 dev->ifalias = NULL;
96ca4a2c
OH
1215 return 0;
1216 }
1217
7364e445
AK
1218 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1219 if (!new_ifalias)
0b815a1a 1220 return -ENOMEM;
7364e445 1221 dev->ifalias = new_ifalias;
0b815a1a
SH
1222
1223 strlcpy(dev->ifalias, alias, len+1);
1224 return len;
1225}
1226
1227
d8a33ac4 1228/**
3041a069 1229 * netdev_features_change - device changes features
d8a33ac4
SH
1230 * @dev: device to cause notification
1231 *
1232 * Called to indicate a device has changed features.
1233 */
1234void netdev_features_change(struct net_device *dev)
1235{
056925ab 1236 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1237}
1238EXPORT_SYMBOL(netdev_features_change);
1239
1da177e4
LT
1240/**
1241 * netdev_state_change - device changes state
1242 * @dev: device to cause notification
1243 *
1244 * Called to indicate a device has changed state. This function calls
1245 * the notifier chains for netdev_chain and sends a NEWLINK message
1246 * to the routing socket.
1247 */
1248void netdev_state_change(struct net_device *dev)
1249{
1250 if (dev->flags & IFF_UP) {
54951194
LP
1251 struct netdev_notifier_change_info change_info;
1252
1253 change_info.flags_changed = 0;
1254 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1255 &change_info.info);
7f294054 1256 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1da177e4
LT
1257 }
1258}
d1b19dff 1259EXPORT_SYMBOL(netdev_state_change);
1da177e4 1260
ee89bab1
AW
1261/**
1262 * netdev_notify_peers - notify network peers about existence of @dev
1263 * @dev: network device
1264 *
1265 * Generate traffic such that interested network peers are aware of
1266 * @dev, such as by generating a gratuitous ARP. This may be used when
1267 * a device wants to inform the rest of the network about some sort of
1268 * reconfiguration such as a failover event or virtual machine
1269 * migration.
1270 */
1271void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1272{
ee89bab1
AW
1273 rtnl_lock();
1274 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1275 rtnl_unlock();
c1da4ac7 1276}
ee89bab1 1277EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1278
bd380811 1279static int __dev_open(struct net_device *dev)
1da177e4 1280{
d314774c 1281 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1282 int ret;
1da177e4 1283
e46b66bc
BH
1284 ASSERT_RTNL();
1285
1da177e4
LT
1286 if (!netif_device_present(dev))
1287 return -ENODEV;
1288
ca99ca14
NH
1289 /* Block netpoll from trying to do any rx path servicing.
1290 * If we don't do this there is a chance ndo_poll_controller
1291 * or ndo_poll may be running while we open the device
1292 */
66b5552f 1293 netpoll_poll_disable(dev);
ca99ca14 1294
3b8bcfd5
JB
1295 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1296 ret = notifier_to_errno(ret);
1297 if (ret)
1298 return ret;
1299
1da177e4 1300 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1301
d314774c
SH
1302 if (ops->ndo_validate_addr)
1303 ret = ops->ndo_validate_addr(dev);
bada339b 1304
d314774c
SH
1305 if (!ret && ops->ndo_open)
1306 ret = ops->ndo_open(dev);
1da177e4 1307
66b5552f 1308 netpoll_poll_enable(dev);
ca99ca14 1309
bada339b
JG
1310 if (ret)
1311 clear_bit(__LINK_STATE_START, &dev->state);
1312 else {
1da177e4 1313 dev->flags |= IFF_UP;
4417da66 1314 dev_set_rx_mode(dev);
1da177e4 1315 dev_activate(dev);
7bf23575 1316 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1317 }
bada339b 1318
1da177e4
LT
1319 return ret;
1320}
1321
1322/**
bd380811
PM
1323 * dev_open - prepare an interface for use.
1324 * @dev: device to open
1da177e4 1325 *
bd380811
PM
1326 * Takes a device from down to up state. The device's private open
1327 * function is invoked and then the multicast lists are loaded. Finally
1328 * the device is moved into the up state and a %NETDEV_UP message is
1329 * sent to the netdev notifier chain.
1330 *
1331 * Calling this function on an active interface is a nop. On a failure
1332 * a negative errno code is returned.
1da177e4 1333 */
bd380811
PM
1334int dev_open(struct net_device *dev)
1335{
1336 int ret;
1337
bd380811
PM
1338 if (dev->flags & IFF_UP)
1339 return 0;
1340
bd380811
PM
1341 ret = __dev_open(dev);
1342 if (ret < 0)
1343 return ret;
1344
7f294054 1345 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
bd380811
PM
1346 call_netdevice_notifiers(NETDEV_UP, dev);
1347
1348 return ret;
1349}
1350EXPORT_SYMBOL(dev_open);
1351
44345724 1352static int __dev_close_many(struct list_head *head)
1da177e4 1353{
44345724 1354 struct net_device *dev;
e46b66bc 1355
bd380811 1356 ASSERT_RTNL();
9d5010db
DM
1357 might_sleep();
1358
5cde2829 1359 list_for_each_entry(dev, head, close_list) {
3f4df206 1360 /* Temporarily disable netpoll until the interface is down */
66b5552f 1361 netpoll_poll_disable(dev);
3f4df206 1362
44345724 1363 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1364
44345724 1365 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1366
44345724
OP
1367 /* Synchronize to scheduled poll. We cannot touch poll list, it
1368 * can be even on different cpu. So just clear netif_running().
1369 *
1370 * dev->stop() will invoke napi_disable() on all of it's
1371 * napi_struct instances on this device.
1372 */
4e857c58 1373 smp_mb__after_atomic(); /* Commit netif_running(). */
44345724 1374 }
1da177e4 1375
44345724 1376 dev_deactivate_many(head);
d8b2a4d2 1377
5cde2829 1378 list_for_each_entry(dev, head, close_list) {
44345724 1379 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1380
44345724
OP
1381 /*
1382 * Call the device specific close. This cannot fail.
1383 * Only if device is UP
1384 *
1385 * We allow it to be called even after a DETACH hot-plug
1386 * event.
1387 */
1388 if (ops->ndo_stop)
1389 ops->ndo_stop(dev);
1390
44345724 1391 dev->flags &= ~IFF_UP;
66b5552f 1392 netpoll_poll_enable(dev);
44345724
OP
1393 }
1394
1395 return 0;
1396}
1397
1398static int __dev_close(struct net_device *dev)
1399{
f87e6f47 1400 int retval;
44345724
OP
1401 LIST_HEAD(single);
1402
5cde2829 1403 list_add(&dev->close_list, &single);
f87e6f47
LT
1404 retval = __dev_close_many(&single);
1405 list_del(&single);
ca99ca14 1406
f87e6f47 1407 return retval;
44345724
OP
1408}
1409
99c4a26a 1410int dev_close_many(struct list_head *head, bool unlink)
44345724
OP
1411{
1412 struct net_device *dev, *tmp;
1da177e4 1413
5cde2829
EB
1414 /* Remove the devices that don't need to be closed */
1415 list_for_each_entry_safe(dev, tmp, head, close_list)
44345724 1416 if (!(dev->flags & IFF_UP))
5cde2829 1417 list_del_init(&dev->close_list);
44345724
OP
1418
1419 __dev_close_many(head);
1da177e4 1420
5cde2829 1421 list_for_each_entry_safe(dev, tmp, head, close_list) {
7f294054 1422 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
44345724 1423 call_netdevice_notifiers(NETDEV_DOWN, dev);
99c4a26a
DM
1424 if (unlink)
1425 list_del_init(&dev->close_list);
44345724 1426 }
bd380811
PM
1427
1428 return 0;
1429}
99c4a26a 1430EXPORT_SYMBOL(dev_close_many);
bd380811
PM
1431
1432/**
1433 * dev_close - shutdown an interface.
1434 * @dev: device to shutdown
1435 *
1436 * This function moves an active device into down state. A
1437 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1438 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1439 * chain.
1440 */
1441int dev_close(struct net_device *dev)
1442{
e14a5993
ED
1443 if (dev->flags & IFF_UP) {
1444 LIST_HEAD(single);
1da177e4 1445
5cde2829 1446 list_add(&dev->close_list, &single);
99c4a26a 1447 dev_close_many(&single, true);
e14a5993
ED
1448 list_del(&single);
1449 }
da6e378b 1450 return 0;
1da177e4 1451}
d1b19dff 1452EXPORT_SYMBOL(dev_close);
1da177e4
LT
1453
1454
0187bdfb
BH
1455/**
1456 * dev_disable_lro - disable Large Receive Offload on a device
1457 * @dev: device
1458 *
1459 * Disable Large Receive Offload (LRO) on a net device. Must be
1460 * called under RTNL. This is needed if received packets may be
1461 * forwarded to another interface.
1462 */
1463void dev_disable_lro(struct net_device *dev)
1464{
fbe168ba
MK
1465 struct net_device *lower_dev;
1466 struct list_head *iter;
529d0489 1467
bc5787c6
MM
1468 dev->wanted_features &= ~NETIF_F_LRO;
1469 netdev_update_features(dev);
27660515 1470
22d5969f
MM
1471 if (unlikely(dev->features & NETIF_F_LRO))
1472 netdev_WARN(dev, "failed to disable LRO!\n");
fbe168ba
MK
1473
1474 netdev_for_each_lower_dev(dev, lower_dev, iter)
1475 dev_disable_lro(lower_dev);
0187bdfb
BH
1476}
1477EXPORT_SYMBOL(dev_disable_lro);
1478
351638e7
JP
1479static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1480 struct net_device *dev)
1481{
1482 struct netdev_notifier_info info;
1483
1484 netdev_notifier_info_init(&info, dev);
1485 return nb->notifier_call(nb, val, &info);
1486}
0187bdfb 1487
881d966b
EB
1488static int dev_boot_phase = 1;
1489
1da177e4
LT
1490/**
1491 * register_netdevice_notifier - register a network notifier block
1492 * @nb: notifier
1493 *
1494 * Register a notifier to be called when network device events occur.
1495 * The notifier passed is linked into the kernel structures and must
1496 * not be reused until it has been unregistered. A negative errno code
1497 * is returned on a failure.
1498 *
1499 * When registered all registration and up events are replayed
4ec93edb 1500 * to the new notifier to allow device to have a race free
1da177e4
LT
1501 * view of the network device list.
1502 */
1503
1504int register_netdevice_notifier(struct notifier_block *nb)
1505{
1506 struct net_device *dev;
fcc5a03a 1507 struct net_device *last;
881d966b 1508 struct net *net;
1da177e4
LT
1509 int err;
1510
1511 rtnl_lock();
f07d5b94 1512 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1513 if (err)
1514 goto unlock;
881d966b
EB
1515 if (dev_boot_phase)
1516 goto unlock;
1517 for_each_net(net) {
1518 for_each_netdev(net, dev) {
351638e7 1519 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
881d966b
EB
1520 err = notifier_to_errno(err);
1521 if (err)
1522 goto rollback;
1523
1524 if (!(dev->flags & IFF_UP))
1525 continue;
1da177e4 1526
351638e7 1527 call_netdevice_notifier(nb, NETDEV_UP, dev);
881d966b 1528 }
1da177e4 1529 }
fcc5a03a
HX
1530
1531unlock:
1da177e4
LT
1532 rtnl_unlock();
1533 return err;
fcc5a03a
HX
1534
1535rollback:
1536 last = dev;
881d966b
EB
1537 for_each_net(net) {
1538 for_each_netdev(net, dev) {
1539 if (dev == last)
8f891489 1540 goto outroll;
fcc5a03a 1541
881d966b 1542 if (dev->flags & IFF_UP) {
351638e7
JP
1543 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1544 dev);
1545 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
881d966b 1546 }
351638e7 1547 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1548 }
fcc5a03a 1549 }
c67625a1 1550
8f891489 1551outroll:
c67625a1 1552 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1553 goto unlock;
1da177e4 1554}
d1b19dff 1555EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1556
1557/**
1558 * unregister_netdevice_notifier - unregister a network notifier block
1559 * @nb: notifier
1560 *
1561 * Unregister a notifier previously registered by
1562 * register_netdevice_notifier(). The notifier is unlinked into the
1563 * kernel structures and may then be reused. A negative errno code
1564 * is returned on a failure.
7d3d43da
EB
1565 *
1566 * After unregistering unregister and down device events are synthesized
1567 * for all devices on the device list to the removed notifier to remove
1568 * the need for special case cleanup code.
1da177e4
LT
1569 */
1570
1571int unregister_netdevice_notifier(struct notifier_block *nb)
1572{
7d3d43da
EB
1573 struct net_device *dev;
1574 struct net *net;
9f514950
HX
1575 int err;
1576
1577 rtnl_lock();
f07d5b94 1578 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1579 if (err)
1580 goto unlock;
1581
1582 for_each_net(net) {
1583 for_each_netdev(net, dev) {
1584 if (dev->flags & IFF_UP) {
351638e7
JP
1585 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1586 dev);
1587 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
7d3d43da 1588 }
351638e7 1589 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1590 }
1591 }
1592unlock:
9f514950
HX
1593 rtnl_unlock();
1594 return err;
1da177e4 1595}
d1b19dff 1596EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4 1597
351638e7
JP
1598/**
1599 * call_netdevice_notifiers_info - call all network notifier blocks
1600 * @val: value passed unmodified to notifier function
1601 * @dev: net_device pointer passed unmodified to notifier function
1602 * @info: notifier information data
1603 *
1604 * Call all network notifier blocks. Parameters and return value
1605 * are as for raw_notifier_call_chain().
1606 */
1607
1d143d9f 1608static int call_netdevice_notifiers_info(unsigned long val,
1609 struct net_device *dev,
1610 struct netdev_notifier_info *info)
351638e7
JP
1611{
1612 ASSERT_RTNL();
1613 netdev_notifier_info_init(info, dev);
1614 return raw_notifier_call_chain(&netdev_chain, val, info);
1615}
351638e7 1616
1da177e4
LT
1617/**
1618 * call_netdevice_notifiers - call all network notifier blocks
1619 * @val: value passed unmodified to notifier function
c4ea43c5 1620 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1621 *
1622 * Call all network notifier blocks. Parameters and return value
f07d5b94 1623 * are as for raw_notifier_call_chain().
1da177e4
LT
1624 */
1625
ad7379d4 1626int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1627{
351638e7
JP
1628 struct netdev_notifier_info info;
1629
1630 return call_netdevice_notifiers_info(val, dev, &info);
1da177e4 1631}
edf947f1 1632EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1633
#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\

bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
	    unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

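/*
 * Illustrative sketch, not part of the original file: the dev_forward_skb()
 * call site described in the kernel-doc above, an ndo_start_xmit() handler
 * that injects the frame into a peer device (veth-style).  The names
 * example_priv and example_xmit are hypothetical.
 */
struct example_priv {			/* hypothetical driver private data */
	struct net_device *peer;
};

static netdev_tx_t __maybe_unused example_xmit(struct sk_buff *skb,
					       struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}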
71d9dec2
CG
1762static inline int deliver_skb(struct sk_buff *skb,
1763 struct packet_type *pt_prev,
1764 struct net_device *orig_dev)
1765{
1080e512
MT
1766 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1767 return -ENOMEM;
71d9dec2
CG
1768 atomic_inc(&skb->users);
1769 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1770}
1771
7866a621
SN
1772static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1773 struct packet_type **pt,
fbcb2170
JP
1774 struct net_device *orig_dev,
1775 __be16 type,
7866a621
SN
1776 struct list_head *ptype_list)
1777{
1778 struct packet_type *ptype, *pt_prev = *pt;
1779
1780 list_for_each_entry_rcu(ptype, ptype_list, list) {
1781 if (ptype->type != type)
1782 continue;
1783 if (pt_prev)
fbcb2170 1784 deliver_skb(skb, pt_prev, orig_dev);
7866a621
SN
1785 pt_prev = ptype;
1786 }
1787 *pt = pt_prev;
1788}
1789
c0de08d0
EL
1790static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1791{
a3d744e9 1792 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1793 return false;
1794
1795 if (ptype->id_match)
1796 return ptype->id_match(ptype, skb->sk);
1797 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1798 return true;
1799
1800 return false;
1801}
1802
1803/*
1804 * Support routine. Sends outgoing frames to any network
1805 * taps currently in use.
1806 */
1807
f6a78bfc 1808static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1809{
1810 struct packet_type *ptype;
71d9dec2
CG
1811 struct sk_buff *skb2 = NULL;
1812 struct packet_type *pt_prev = NULL;
7866a621 1813 struct list_head *ptype_list = &ptype_all;
a61bbcf2 1814
1da177e4 1815 rcu_read_lock();
7866a621
SN
1816again:
1817 list_for_each_entry_rcu(ptype, ptype_list, list) {
1da177e4
LT
1818 /* Never send packets back to the socket
1819 * they originated from - MvS (miquels@drinkel.ow.org)
1820 */
7866a621
SN
1821 if (skb_loop_sk(ptype, skb))
1822 continue;
71d9dec2 1823
7866a621
SN
1824 if (pt_prev) {
1825 deliver_skb(skb2, pt_prev, skb->dev);
1826 pt_prev = ptype;
1827 continue;
1828 }
1da177e4 1829
7866a621
SN
1830 /* need to clone skb, done only once */
1831 skb2 = skb_clone(skb, GFP_ATOMIC);
1832 if (!skb2)
1833 goto out_unlock;
70978182 1834
7866a621 1835 net_timestamp_set(skb2);
1da177e4 1836
7866a621
SN
1837 /* skb->nh should be correctly
1838 * set by sender, so that the second statement is
1839 * just protection against buggy protocols.
1840 */
1841 skb_reset_mac_header(skb2);
1842
1843 if (skb_network_header(skb2) < skb2->data ||
1844 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1845 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1846 ntohs(skb2->protocol),
1847 dev->name);
1848 skb_reset_network_header(skb2);
1da177e4 1849 }
7866a621
SN
1850
1851 skb2->transport_header = skb2->network_header;
1852 skb2->pkt_type = PACKET_OUTGOING;
1853 pt_prev = ptype;
1854 }
1855
1856 if (ptype_list == &ptype_all) {
1857 ptype_list = &dev->ptype_all;
1858 goto again;
1da177e4 1859 }
7866a621 1860out_unlock:
71d9dec2
CG
1861 if (pt_prev)
1862 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1863 rcu_read_unlock();
1864}
1865
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid and nothing can be done, so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

1904#ifdef CONFIG_XPS
1905static DEFINE_MUTEX(xps_map_mutex);
1906#define xmap_dereference(P) \
1907 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1908
10cdc3f3
AD
1909static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1910 int cpu, u16 index)
537c00de 1911{
10cdc3f3
AD
1912 struct xps_map *map = NULL;
1913 int pos;
537c00de 1914
10cdc3f3
AD
1915 if (dev_maps)
1916 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1917
10cdc3f3
AD
1918 for (pos = 0; map && pos < map->len; pos++) {
1919 if (map->queues[pos] == index) {
537c00de
AD
1920 if (map->len > 1) {
1921 map->queues[pos] = map->queues[--map->len];
1922 } else {
10cdc3f3 1923 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1924 kfree_rcu(map, rcu);
1925 map = NULL;
1926 }
10cdc3f3 1927 break;
537c00de 1928 }
537c00de
AD
1929 }
1930
10cdc3f3
AD
1931 return map;
1932}
1933
024e9679 1934static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1935{
1936 struct xps_dev_maps *dev_maps;
024e9679 1937 int cpu, i;
10cdc3f3
AD
1938 bool active = false;
1939
1940 mutex_lock(&xps_map_mutex);
1941 dev_maps = xmap_dereference(dev->xps_maps);
1942
1943 if (!dev_maps)
1944 goto out_no_maps;
1945
1946 for_each_possible_cpu(cpu) {
024e9679
AD
1947 for (i = index; i < dev->num_tx_queues; i++) {
1948 if (!remove_xps_queue(dev_maps, cpu, i))
1949 break;
1950 }
1951 if (i == dev->num_tx_queues)
10cdc3f3
AD
1952 active = true;
1953 }
1954
1955 if (!active) {
537c00de
AD
1956 RCU_INIT_POINTER(dev->xps_maps, NULL);
1957 kfree_rcu(dev_maps, rcu);
1958 }
1959
024e9679
AD
1960 for (i = index; i < dev->num_tx_queues; i++)
1961 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1962 NUMA_NO_NODE);
1963
537c00de
AD
1964out_no_maps:
1965 mutex_unlock(&xps_map_mutex);
1966}
1967
01c5f864
AD
1968static struct xps_map *expand_xps_map(struct xps_map *map,
1969 int cpu, u16 index)
1970{
1971 struct xps_map *new_map;
1972 int alloc_len = XPS_MIN_MAP_ALLOC;
1973 int i, pos;
1974
1975 for (pos = 0; map && pos < map->len; pos++) {
1976 if (map->queues[pos] != index)
1977 continue;
1978 return map;
1979 }
1980
1981 /* Need to add queue to this CPU's existing map */
1982 if (map) {
1983 if (pos < map->alloc_len)
1984 return map;
1985
1986 alloc_len = map->alloc_len * 2;
1987 }
1988
1989 /* Need to allocate new map to store queue on this CPU's map */
1990 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1991 cpu_to_node(cpu));
1992 if (!new_map)
1993 return NULL;
1994
1995 for (i = 0; i < pos; i++)
1996 new_map->queues[i] = map->queues[i];
1997 new_map->alloc_len = alloc_len;
1998 new_map->len = pos;
1999
2000 return new_map;
2001}
2002
3573540c
MT
2003int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2004 u16 index)
537c00de 2005{
01c5f864 2006 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 2007 struct xps_map *map, *new_map;
537c00de 2008 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
2009 int cpu, numa_node_id = -2;
2010 bool active = false;
537c00de
AD
2011
2012 mutex_lock(&xps_map_mutex);
2013
2014 dev_maps = xmap_dereference(dev->xps_maps);
2015
01c5f864
AD
2016 /* allocate memory for queue storage */
2017 for_each_online_cpu(cpu) {
2018 if (!cpumask_test_cpu(cpu, mask))
2019 continue;
2020
2021 if (!new_dev_maps)
2022 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
2023 if (!new_dev_maps) {
2024 mutex_unlock(&xps_map_mutex);
01c5f864 2025 return -ENOMEM;
2bb60cb9 2026 }
01c5f864
AD
2027
2028 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2029 NULL;
2030
2031 map = expand_xps_map(map, cpu, index);
2032 if (!map)
2033 goto error;
2034
2035 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2036 }
2037
2038 if (!new_dev_maps)
2039 goto out_no_new_maps;
2040
537c00de 2041 for_each_possible_cpu(cpu) {
01c5f864
AD
2042 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2043 /* add queue to CPU maps */
2044 int pos = 0;
2045
2046 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2047 while ((pos < map->len) && (map->queues[pos] != index))
2048 pos++;
2049
2050 if (pos == map->len)
2051 map->queues[map->len++] = index;
537c00de 2052#ifdef CONFIG_NUMA
537c00de
AD
2053 if (numa_node_id == -2)
2054 numa_node_id = cpu_to_node(cpu);
2055 else if (numa_node_id != cpu_to_node(cpu))
2056 numa_node_id = -1;
537c00de 2057#endif
01c5f864
AD
2058 } else if (dev_maps) {
2059 /* fill in the new device map from the old device map */
2060 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2061 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 2062 }
01c5f864 2063
537c00de
AD
2064 }
2065
01c5f864
AD
2066 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2067
537c00de 2068 /* Cleanup old maps */
01c5f864
AD
2069 if (dev_maps) {
2070 for_each_possible_cpu(cpu) {
2071 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2072 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2073 if (map && map != new_map)
2074 kfree_rcu(map, rcu);
2075 }
537c00de 2076
01c5f864 2077 kfree_rcu(dev_maps, rcu);
537c00de
AD
2078 }
2079
01c5f864
AD
2080 dev_maps = new_dev_maps;
2081 active = true;
537c00de 2082
01c5f864
AD
2083out_no_new_maps:
2084 /* update Tx queue numa node */
537c00de
AD
2085 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2086 (numa_node_id >= 0) ? numa_node_id :
2087 NUMA_NO_NODE);
2088
01c5f864
AD
2089 if (!dev_maps)
2090 goto out_no_maps;
2091
2092 /* removes queue from unused CPUs */
2093 for_each_possible_cpu(cpu) {
2094 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2095 continue;
2096
2097 if (remove_xps_queue(dev_maps, cpu, index))
2098 active = true;
2099 }
2100
2101 /* free map if not active */
2102 if (!active) {
2103 RCU_INIT_POINTER(dev->xps_maps, NULL);
2104 kfree_rcu(dev_maps, rcu);
2105 }
2106
2107out_no_maps:
537c00de
AD
2108 mutex_unlock(&xps_map_mutex);
2109
2110 return 0;
2111error:
01c5f864
AD
2112 /* remove any maps that we added */
2113 for_each_possible_cpu(cpu) {
2114 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2115 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2116 NULL;
2117 if (new_map && new_map != map)
2118 kfree(new_map);
2119 }
2120
537c00de
AD
2121 mutex_unlock(&xps_map_mutex);
2122
537c00de
AD
2123 kfree(new_dev_maps);
2124 return -ENOMEM;
2125}
2126EXPORT_SYMBOL(netif_set_xps_queue);
2127
2128#endif
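/*
 * Illustrative sketch, not part of the original file: a multiqueue driver
 * giving each TX queue a one-CPU XPS affinity through netif_set_xps_queue()
 * above.  example_set_default_xps() is a hypothetical helper; errors are
 * ignored because XPS is only a transmit-steering hint.
 */
static void __maybe_unused example_set_default_xps(struct net_device *dev)
{
#ifdef CONFIG_XPS
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++)
		netif_set_xps_queue(dev, cpumask_of(i % nr_cpu_ids), i);
#endif
}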
f0796d5c
JF
2129/*
2130 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2131 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2132 */
e6484930 2133int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2134{
1d24eb48
TH
2135 int rc;
2136
e6484930
TH
2137 if (txq < 1 || txq > dev->num_tx_queues)
2138 return -EINVAL;
f0796d5c 2139
5c56580b
BH
2140 if (dev->reg_state == NETREG_REGISTERED ||
2141 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2142 ASSERT_RTNL();
2143
1d24eb48
TH
2144 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2145 txq);
bf264145
TH
2146 if (rc)
2147 return rc;
2148
4f57c087
JF
2149 if (dev->num_tc)
2150 netif_setup_tc(dev, txq);
2151
024e9679 2152 if (txq < dev->real_num_tx_queues) {
e6484930 2153 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2154#ifdef CONFIG_XPS
2155 netif_reset_xps_queues_gt(dev, txq);
2156#endif
2157 }
f0796d5c 2158 }
e6484930
TH
2159
2160 dev->real_num_tx_queues = txq;
2161 return 0;
f0796d5c
JF
2162}
2163EXPORT_SYMBOL(netif_set_real_num_tx_queues);
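/*
 * Illustrative sketch, not part of the original file: shrinking the number
 * of in-use TX queues at runtime, e.g. from an ethtool channel change.  As
 * required by netif_set_real_num_tx_queues() above, a registered device must
 * hold the rtnl lock.  example_set_tx_channels() is a hypothetical helper.
 */
static int __maybe_unused example_set_tx_channels(struct net_device *dev,
						  unsigned int txq)
{
	int err;

	ASSERT_RTNL();
	err = netif_set_real_num_tx_queues(dev, txq);
	if (err)
		netdev_err(dev, "failed to set %u tx queues: %d\n", txq, err);
	return err;
}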
56079431 2164
a953be53 2165#ifdef CONFIG_SYSFS
62fe0b40
BH
2166/**
2167 * netif_set_real_num_rx_queues - set actual number of RX queues used
2168 * @dev: Network device
2169 * @rxq: Actual number of RX queues
2170 *
2171 * This must be called either with the rtnl_lock held or before
2172 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2173 * negative error code. If called before registration, it always
2174 * succeeds.
62fe0b40
BH
2175 */
2176int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2177{
2178 int rc;
2179
bd25fa7b
TH
2180 if (rxq < 1 || rxq > dev->num_rx_queues)
2181 return -EINVAL;
2182
62fe0b40
BH
2183 if (dev->reg_state == NETREG_REGISTERED) {
2184 ASSERT_RTNL();
2185
62fe0b40
BH
2186 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2187 rxq);
2188 if (rc)
2189 return rc;
62fe0b40
BH
2190 }
2191
2192 dev->real_num_rx_queues = rxq;
2193 return 0;
2194}
2195EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2196#endif
2197
2c53040f
BH
2198/**
2199 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2200 *
2201 * This routine should set an upper limit on the number of RSS queues
2202 * used by default by multiqueue devices.
2203 */
a55b138b 2204int netif_get_num_default_rss_queues(void)
16917b87
YM
2205{
2206 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2207}
2208EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2209
def82a1d 2210static inline void __netif_reschedule(struct Qdisc *q)
56079431 2211{
def82a1d
JP
2212 struct softnet_data *sd;
2213 unsigned long flags;
56079431 2214
def82a1d 2215 local_irq_save(flags);
903ceff7 2216 sd = this_cpu_ptr(&softnet_data);
a9cbd588
CG
2217 q->next_sched = NULL;
2218 *sd->output_queue_tailp = q;
2219 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2220 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2221 local_irq_restore(flags);
2222}
2223
2224void __netif_schedule(struct Qdisc *q)
2225{
2226 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2227 __netif_reschedule(q);
56079431
DV
2228}
2229EXPORT_SYMBOL(__netif_schedule);
2230
e6247027
ED
2231struct dev_kfree_skb_cb {
2232 enum skb_free_reason reason;
2233};
2234
2235static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
56079431 2236{
e6247027
ED
2237 return (struct dev_kfree_skb_cb *)skb->cb;
2238}
2239
46e5da40
JF
2240void netif_schedule_queue(struct netdev_queue *txq)
2241{
2242 rcu_read_lock();
2243 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2244 struct Qdisc *q = rcu_dereference(txq->qdisc);
2245
2246 __netif_schedule(q);
2247 }
2248 rcu_read_unlock();
2249}
2250EXPORT_SYMBOL(netif_schedule_queue);
2251
2252/**
2253 * netif_wake_subqueue - allow sending packets on subqueue
2254 * @dev: network device
2255 * @queue_index: sub queue index
2256 *
2257 * Resume individual transmit queue of a device with multiple transmit queues.
2258 */
2259void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2260{
2261 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2262
2263 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2264 struct Qdisc *q;
2265
2266 rcu_read_lock();
2267 q = rcu_dereference(txq->qdisc);
2268 __netif_schedule(q);
2269 rcu_read_unlock();
2270 }
2271}
2272EXPORT_SYMBOL(netif_wake_subqueue);
2273
2274void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2275{
2276 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2277 struct Qdisc *q;
2278
2279 rcu_read_lock();
2280 q = rcu_dereference(dev_queue->qdisc);
2281 __netif_schedule(q);
2282 rcu_read_unlock();
2283 }
2284}
2285EXPORT_SYMBOL(netif_tx_wake_queue);
2286
e6247027 2287void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
56079431 2288{
e6247027 2289 unsigned long flags;
56079431 2290
e6247027
ED
2291 if (likely(atomic_read(&skb->users) == 1)) {
2292 smp_rmb();
2293 atomic_set(&skb->users, 0);
2294 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2295 return;
bea3348e 2296 }
e6247027
ED
2297 get_kfree_skb_cb(skb)->reason = reason;
2298 local_irq_save(flags);
2299 skb->next = __this_cpu_read(softnet_data.completion_queue);
2300 __this_cpu_write(softnet_data.completion_queue, skb);
2301 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2302 local_irq_restore(flags);
56079431 2303}
e6247027 2304EXPORT_SYMBOL(__dev_kfree_skb_irq);
56079431 2305
e6247027 2306void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
56079431
DV
2307{
2308 if (in_irq() || irqs_disabled())
e6247027 2309 __dev_kfree_skb_irq(skb, reason);
56079431
DV
2310 else
2311 dev_kfree_skb(skb);
2312}
e6247027 2313EXPORT_SYMBOL(__dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
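/*
 * Illustrative sketch, not part of the original file: the usual pairing of
 * netif_device_detach()/netif_device_attach() in a driver's suspend/resume
 * path.  example_suspend/example_resume are hypothetical names; the real
 * hardware quiesce and reinit steps are elided.
 */
static int __maybe_unused example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* mark removed, stop all TX queues */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int __maybe_unused example_resume(struct net_device *dev)
{
	/* ... reinitialize the hardware ... */
	netif_device_attach(dev);	/* mark present, wake TX queues */
	return 0;
}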
2346
5605c762
JP
2347/*
2348 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2349 * to be used as a distribution range.
2350 */
2351u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2352 unsigned int num_tx_queues)
2353{
2354 u32 hash;
2355 u16 qoffset = 0;
2356 u16 qcount = num_tx_queues;
2357
2358 if (skb_rx_queue_recorded(skb)) {
2359 hash = skb_get_rx_queue(skb);
2360 while (unlikely(hash >= num_tx_queues))
2361 hash -= num_tx_queues;
2362 return hash;
2363 }
2364
2365 if (dev->num_tc) {
2366 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2367 qoffset = dev->tc_to_txq[tc].offset;
2368 qcount = dev->tc_to_txq[tc].count;
2369 }
2370
2371 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2372}
2373EXPORT_SYMBOL(__skb_tx_hash);
2374
36c92474
BH
2375static void skb_warn_bad_offload(const struct sk_buff *skb)
2376{
65e9d2fa 2377 static const netdev_features_t null_features = 0;
36c92474
BH
2378 struct net_device *dev = skb->dev;
2379 const char *driver = "";
2380
c846ad9b
BG
2381 if (!net_ratelimit())
2382 return;
2383
36c92474
BH
2384 if (dev && dev->dev.parent)
2385 driver = dev_driver_string(dev->dev.parent);
2386
2387 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2388 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2389 driver, dev ? &dev->features : &null_features,
2390 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2391 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2392 skb_shinfo(skb)->gso_type, skb->ip_summed);
2393}
2394
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

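/*
 * Illustrative sketch, not part of the original file: the classic
 * skb_checksum_help() call site in a driver's xmit path when the hardware
 * cannot checksum a given CHECKSUM_PARTIAL frame.  example_tx_csum() is a
 * hypothetical helper name.
 */
static int __maybe_unused example_tx_csum(struct sk_buff *skb,
					  bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum) {
		/* Fall back to a software checksum before handing the
		 * frame to the hardware.
		 */
		if (skb_checksum_help(skb))
			return -EIO;
	}
	return 0;
}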
53d6471c 2443__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
f6a78bfc 2444{
252e3346 2445 __be16 type = skb->protocol;
f6a78bfc 2446
19acc327
PS
2447 /* Tunnel gso handlers can set protocol to ethernet. */
2448 if (type == htons(ETH_P_TEB)) {
2449 struct ethhdr *eth;
2450
2451 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2452 return 0;
2453
2454 eth = (struct ethhdr *)skb_mac_header(skb);
2455 type = eth->h_proto;
2456 }
2457
d4bcef3f 2458 return __vlan_get_protocol(skb, type, depth);
ec5f0615
PS
2459}
2460
2461/**
2462 * skb_mac_gso_segment - mac layer segmentation handler.
2463 * @skb: buffer to segment
2464 * @features: features for the output path (see dev->features)
2465 */
2466struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2467 netdev_features_t features)
2468{
2469 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2470 struct packet_offload *ptype;
53d6471c
VY
2471 int vlan_depth = skb->mac_len;
2472 __be16 type = skb_network_protocol(skb, &vlan_depth);
ec5f0615
PS
2473
2474 if (unlikely(!type))
2475 return ERR_PTR(-EINVAL);
2476
53d6471c 2477 __skb_pull(skb, vlan_depth);
f6a78bfc
HX
2478
2479 rcu_read_lock();
22061d80 2480 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2481 if (ptype->type == type && ptype->callbacks.gso_segment) {
f191a1d1 2482 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2483 break;
2484 }
2485 }
2486 rcu_read_unlock();
2487
98e399f8 2488 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2489
f6a78bfc
HX
2490 return segs;
2491}
05e8ef4a
PS
2492EXPORT_SYMBOL(skb_mac_gso_segment);
2493
2494
2495/* openvswitch calls this on rx path, so we need a different check.
2496 */
2497static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2498{
2499 if (tx_path)
2500 return skb->ip_summed != CHECKSUM_PARTIAL;
2501 else
2502 return skb->ip_summed == CHECKSUM_NONE;
2503}
2504
2505/**
2506 * __skb_gso_segment - Perform segmentation on skb.
2507 * @skb: buffer to segment
2508 * @features: features for the output path (see dev->features)
2509 * @tx_path: whether it is called in TX path
2510 *
2511 * This function segments the given skb and returns a list of segments.
2512 *
2513 * It may return NULL if the skb requires no segmentation. This is
2514 * only possible when GSO is used for verifying header integrity.
2515 */
2516struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2517 netdev_features_t features, bool tx_path)
2518{
2519 if (unlikely(skb_needs_check(skb, tx_path))) {
2520 int err;
2521
2522 skb_warn_bad_offload(skb);
2523
a40e0a66 2524 err = skb_cow_head(skb, 0);
2525 if (err < 0)
05e8ef4a
PS
2526 return ERR_PTR(err);
2527 }
2528
68c33163 2529 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3347c960
ED
2530 SKB_GSO_CB(skb)->encap_level = 0;
2531
05e8ef4a
PS
2532 skb_reset_mac_header(skb);
2533 skb_reset_mac_len(skb);
2534
2535 return skb_mac_gso_segment(skb, features);
2536}
12b0004d 2537EXPORT_SYMBOL(__skb_gso_segment);
f6a78bfc 2538
fb286bb2
HX
2539/* Take action when hardware reception checksum errors are detected. */
2540#ifdef CONFIG_BUG
2541void netdev_rx_csum_fault(struct net_device *dev)
2542{
2543 if (net_ratelimit()) {
7b6cd1ce 2544 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2545 dump_stack();
2546 }
2547}
2548EXPORT_SYMBOL(netdev_rx_csum_fault);
2549#endif
2550
2551/* Actually, we should eliminate this check as soon as we know, that:
2552 * 1. IOMMU is present and allows to map all the memory.
2553 * 2. No high memory really exists on this machine.
2554 */
2555
c1e756bf 2556static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2557{
3d3a8533 2558#ifdef CONFIG_HIGHMEM
1da177e4 2559 int i;
5acbbd42 2560 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2561 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2562 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2563 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2564 return 1;
ea2ab693 2565 }
5acbbd42 2566 }
1da177e4 2567
5acbbd42
FT
2568 if (PCI_DMA_BUS_IS_PHYS) {
2569 struct device *pdev = dev->dev.parent;
1da177e4 2570
9092c658
ED
2571 if (!pdev)
2572 return 0;
5acbbd42 2573 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2574 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2575 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2576 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2577 return 1;
2578 }
2579 }
3d3a8533 2580#endif
1da177e4
LT
2581 return 0;
2582}
1da177e4 2583
3b392ddb
SH
2584/* If MPLS offload request, verify we are testing hardware MPLS features
2585 * instead of standard features for the netdev.
2586 */
d0edc7bf 2587#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3b392ddb
SH
2588static netdev_features_t net_mpls_features(struct sk_buff *skb,
2589 netdev_features_t features,
2590 __be16 type)
2591{
25cd9ba0 2592 if (eth_p_mpls(type))
3b392ddb
SH
2593 features &= skb->dev->mpls_features;
2594
2595 return features;
2596}
2597#else
2598static netdev_features_t net_mpls_features(struct sk_buff *skb,
2599 netdev_features_t features,
2600 __be16 type)
2601{
2602 return features;
2603}
2604#endif
2605
c8f44aff 2606static netdev_features_t harmonize_features(struct sk_buff *skb,
c1e756bf 2607 netdev_features_t features)
f01a5236 2608{
53d6471c 2609 int tmp;
3b392ddb
SH
2610 __be16 type;
2611
2612 type = skb_network_protocol(skb, &tmp);
2613 features = net_mpls_features(skb, features, type);
53d6471c 2614
c0d680e5 2615 if (skb->ip_summed != CHECKSUM_NONE &&
3b392ddb 2616 !can_checksum_protocol(features, type)) {
f01a5236 2617 features &= ~NETIF_F_ALL_CSUM;
c1e756bf 2618 } else if (illegal_highdma(skb->dev, skb)) {
f01a5236
JG
2619 features &= ~NETIF_F_SG;
2620 }
2621
2622 return features;
2623}
2624
e38f3025
TM
2625netdev_features_t passthru_features_check(struct sk_buff *skb,
2626 struct net_device *dev,
2627 netdev_features_t features)
2628{
2629 return features;
2630}
2631EXPORT_SYMBOL(passthru_features_check);
2632
8cb65d00
TM
2633static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2634 struct net_device *dev,
2635 netdev_features_t features)
2636{
2637 return vlan_features_check(skb, features);
2638}
2639
c1e756bf 2640netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6 2641{
5f35227e 2642 struct net_device *dev = skb->dev;
fcbeb976
ED
2643 netdev_features_t features = dev->features;
2644 u16 gso_segs = skb_shinfo(skb)->gso_segs;
58e998c6 2645
fcbeb976 2646 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
30b678d8
BH
2647 features &= ~NETIF_F_GSO_MASK;
2648
5f35227e
JG
2649 /* If encapsulation offload request, verify we are testing
2650 * hardware encapsulation features instead of standard
2651 * features for the netdev
2652 */
2653 if (skb->encapsulation)
2654 features &= dev->hw_enc_features;
2655
f5a7fb88
TM
2656 if (skb_vlan_tagged(skb))
2657 features = netdev_intersect_features(features,
2658 dev->vlan_features |
2659 NETIF_F_HW_VLAN_CTAG_TX |
2660 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2661
5f35227e
JG
2662 if (dev->netdev_ops->ndo_features_check)
2663 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2664 features);
8cb65d00
TM
2665 else
2666 features &= dflt_features_check(skb, dev, features);
5f35227e 2667
c1e756bf 2668 return harmonize_features(skb, features);
58e998c6 2669}
c1e756bf 2670EXPORT_SYMBOL(netif_skb_features);
58e998c6 2671
2ea25513 2672static int xmit_one(struct sk_buff *skb, struct net_device *dev,
95f6b3dd 2673 struct netdev_queue *txq, bool more)
f6a78bfc 2674{
2ea25513
DM
2675 unsigned int len;
2676 int rc;
00829823 2677
7866a621 2678 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2ea25513 2679 dev_queue_xmit_nit(skb, dev);
fc741216 2680
2ea25513
DM
2681 len = skb->len;
2682 trace_net_dev_start_xmit(skb, dev);
95f6b3dd 2683 rc = netdev_start_xmit(skb, dev, txq, more);
2ea25513 2684 trace_net_dev_xmit(skb, rc, dev, len);
adf30907 2685
2ea25513
DM
2686 return rc;
2687}
7b9c6090 2688
8dcda22a
DM
2689struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2690 struct netdev_queue *txq, int *ret)
7f2e870f
DM
2691{
2692 struct sk_buff *skb = first;
2693 int rc = NETDEV_TX_OK;
7b9c6090 2694
7f2e870f
DM
2695 while (skb) {
2696 struct sk_buff *next = skb->next;
fc70fb64 2697
7f2e870f 2698 skb->next = NULL;
95f6b3dd 2699 rc = xmit_one(skb, dev, txq, next != NULL);
7f2e870f
DM
2700 if (unlikely(!dev_xmit_complete(rc))) {
2701 skb->next = next;
2702 goto out;
2703 }
6afff0ca 2704
7f2e870f
DM
2705 skb = next;
2706 if (netif_xmit_stopped(txq) && skb) {
2707 rc = NETDEV_TX_BUSY;
2708 break;
9ccb8975 2709 }
7f2e870f 2710 }
9ccb8975 2711
7f2e870f
DM
2712out:
2713 *ret = rc;
2714 return skb;
2715}
b40863c6 2716
1ff0dc94
ED
2717static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2718 netdev_features_t features)
f6a78bfc 2719{
df8a39de 2720 if (skb_vlan_tag_present(skb) &&
5968250c
JP
2721 !vlan_hw_offload_capable(features, skb->vlan_proto))
2722 skb = __vlan_hwaccel_push_inside(skb);
eae3f88e
DM
2723 return skb;
2724}
f6a78bfc 2725
55a93b3e 2726static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
eae3f88e
DM
2727{
2728 netdev_features_t features;
f6a78bfc 2729
eae3f88e
DM
2730 if (skb->next)
2731 return skb;
068a2de5 2732
eae3f88e
DM
2733 features = netif_skb_features(skb);
2734 skb = validate_xmit_vlan(skb, features);
2735 if (unlikely(!skb))
2736 goto out_null;
7b9c6090 2737
8b86a61d 2738 if (netif_needs_gso(skb, features)) {
ce93718f
DM
2739 struct sk_buff *segs;
2740
2741 segs = skb_gso_segment(skb, features);
cecda693 2742 if (IS_ERR(segs)) {
af6dabc9 2743 goto out_kfree_skb;
cecda693
JW
2744 } else if (segs) {
2745 consume_skb(skb);
2746 skb = segs;
f6a78bfc 2747 }
eae3f88e
DM
2748 } else {
2749 if (skb_needs_linearize(skb, features) &&
2750 __skb_linearize(skb))
2751 goto out_kfree_skb;
4ec93edb 2752
eae3f88e
DM
2753 /* If packet is not checksummed and device does not
2754 * support checksumming for this protocol, complete
2755 * checksumming here.
2756 */
2757 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2758 if (skb->encapsulation)
2759 skb_set_inner_transport_header(skb,
2760 skb_checksum_start_offset(skb));
2761 else
2762 skb_set_transport_header(skb,
2763 skb_checksum_start_offset(skb));
2764 if (!(features & NETIF_F_ALL_CSUM) &&
2765 skb_checksum_help(skb))
2766 goto out_kfree_skb;
7b9c6090 2767 }
0c772159 2768 }
7b9c6090 2769
eae3f88e 2770 return skb;
fc70fb64 2771
f6a78bfc
HX
2772out_kfree_skb:
2773 kfree_skb(skb);
eae3f88e
DM
2774out_null:
2775 return NULL;
2776}
6afff0ca 2777
55a93b3e
ED
2778struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2779{
2780 struct sk_buff *next, *head = NULL, *tail;
2781
bec3cfdc 2782 for (; skb != NULL; skb = next) {
55a93b3e
ED
2783 next = skb->next;
2784 skb->next = NULL;
bec3cfdc
ED
2785
2786 /* in case skb wont be segmented, point to itself */
2787 skb->prev = skb;
2788
55a93b3e 2789 skb = validate_xmit_skb(skb, dev);
bec3cfdc
ED
2790 if (!skb)
2791 continue;
55a93b3e 2792
bec3cfdc
ED
2793 if (!head)
2794 head = skb;
2795 else
2796 tail->next = skb;
2797 /* If skb was segmented, skb->prev points to
2798 * the last segment. If not, it still contains skb.
2799 */
2800 tail = skb->prev;
55a93b3e
ED
2801 }
2802 return head;
f6a78bfc
HX
2803}
2804
1def9238
ED
2805static void qdisc_pkt_len_init(struct sk_buff *skb)
2806{
2807 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2808
2809 qdisc_skb_cb(skb)->pkt_len = skb->len;
2810
2811 /* To get more precise estimation of bytes sent on wire,
2812 * we add to pkt_len the headers size of all segments
2813 */
2814 if (shinfo->gso_size) {
757b8b1d 2815 unsigned int hdr_len;
15e5a030 2816 u16 gso_segs = shinfo->gso_segs;
1def9238 2817
757b8b1d
ED
2818 /* mac layer + network layer */
2819 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2820
2821 /* + transport layer */
1def9238
ED
2822 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2823 hdr_len += tcp_hdrlen(skb);
2824 else
2825 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2826
2827 if (shinfo->gso_type & SKB_GSO_DODGY)
2828 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2829 shinfo->gso_size);
2830
2831 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2832 }
2833}
2834
bbd8a0d3
KK
2835static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2836 struct net_device *dev,
2837 struct netdev_queue *txq)
2838{
2839 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2840 bool contended;
bbd8a0d3
KK
2841 int rc;
2842
1def9238 2843 qdisc_pkt_len_init(skb);
a2da570d 2844 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2845 /*
2846 * Heuristic to force contended enqueues to serialize on a
2847 * separate lock before trying to get qdisc main lock.
9bf2b8c2
YX
2848 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2849 * often and dequeue packets faster.
79640a4c 2850 */
a2da570d 2851 contended = qdisc_is_running(q);
79640a4c
ED
2852 if (unlikely(contended))
2853 spin_lock(&q->busylock);
2854
bbd8a0d3
KK
2855 spin_lock(root_lock);
2856 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2857 kfree_skb(skb);
2858 rc = NET_XMIT_DROP;
2859 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2860 qdisc_run_begin(q)) {
bbd8a0d3
KK
2861 /*
2862 * This is a work-conserving queue; there are no old skbs
2863 * waiting to be sent out; and the qdisc is not running -
2864 * xmit the skb directly.
2865 */
bfe0d029 2866
bfe0d029
ED
2867 qdisc_bstats_update(q, skb);
2868
55a93b3e 2869 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
79640a4c
ED
2870 if (unlikely(contended)) {
2871 spin_unlock(&q->busylock);
2872 contended = false;
2873 }
bbd8a0d3 2874 __qdisc_run(q);
79640a4c 2875 } else
bc135b23 2876 qdisc_run_end(q);
bbd8a0d3
KK
2877
2878 rc = NET_XMIT_SUCCESS;
2879 } else {
a2da570d 2880 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2881 if (qdisc_run_begin(q)) {
2882 if (unlikely(contended)) {
2883 spin_unlock(&q->busylock);
2884 contended = false;
2885 }
2886 __qdisc_run(q);
2887 }
bbd8a0d3
KK
2888 }
2889 spin_unlock(root_lock);
79640a4c
ED
2890 if (unlikely(contended))
2891 spin_unlock(&q->busylock);
bbd8a0d3
KK
2892 return rc;
2893}
2894
86f8515f 2895#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
5bc1421e
NH
2896static void skb_update_prio(struct sk_buff *skb)
2897{
6977a79d 2898 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2899
91c68ce2
ED
2900 if (!skb->priority && skb->sk && map) {
2901 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2902
2903 if (prioidx < map->priomap_len)
2904 skb->priority = map->priomap[prioidx];
2905 }
5bc1421e
NH
2906}
2907#else
2908#define skb_update_prio(skb)
2909#endif
2910
f60e5990 2911DEFINE_PER_CPU(int, xmit_recursion);
2912EXPORT_SYMBOL(xmit_recursion);
2913
11a766ce 2914#define RECURSION_LIMIT 10
745e20f1 2915
95603e22
MM
2916/**
2917 * dev_loopback_xmit - loop back @skb
2918 * @skb: buffer to transmit
2919 */
7026b1dd 2920int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
95603e22
MM
2921{
2922 skb_reset_mac_header(skb);
2923 __skb_pull(skb, skb_network_offset(skb));
2924 skb->pkt_type = PACKET_LOOPBACK;
2925 skb->ip_summed = CHECKSUM_UNNECESSARY;
2926 WARN_ON(!skb_dst(skb));
2927 skb_dst_force(skb);
2928 netif_rx_ni(skb);
2929 return 0;
2930}
2931EXPORT_SYMBOL(dev_loopback_xmit);
2932
638b2a69
JP
2933static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2934{
2935#ifdef CONFIG_XPS
2936 struct xps_dev_maps *dev_maps;
2937 struct xps_map *map;
2938 int queue_index = -1;
2939
2940 rcu_read_lock();
2941 dev_maps = rcu_dereference(dev->xps_maps);
2942 if (dev_maps) {
2943 map = rcu_dereference(
2944 dev_maps->cpu_map[skb->sender_cpu - 1]);
2945 if (map) {
2946 if (map->len == 1)
2947 queue_index = map->queues[0];
2948 else
2949 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2950 map->len)];
2951 if (unlikely(queue_index >= dev->real_num_tx_queues))
2952 queue_index = -1;
2953 }
2954 }
2955 rcu_read_unlock();
2956
2957 return queue_index;
2958#else
2959 return -1;
2960#endif
2961}
2962
2963static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2964{
2965 struct sock *sk = skb->sk;
2966 int queue_index = sk_tx_queue_get(sk);
2967
2968 if (queue_index < 0 || skb->ooo_okay ||
2969 queue_index >= dev->real_num_tx_queues) {
2970 int new_index = get_xps_queue(dev, skb);
2971 if (new_index < 0)
2972 new_index = skb_tx_hash(dev, skb);
2973
2974 if (queue_index != new_index && sk &&
2975 rcu_access_pointer(sk->sk_dst_cache))
2976 sk_tx_queue_set(sk, new_index);
2977
2978 queue_index = new_index;
2979 }
2980
2981 return queue_index;
2982}
2983
2984struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2985 struct sk_buff *skb,
2986 void *accel_priv)
2987{
2988 int queue_index = 0;
2989
2990#ifdef CONFIG_XPS
2991 if (skb->sender_cpu == 0)
2992 skb->sender_cpu = raw_smp_processor_id() + 1;
2993#endif
2994
2995 if (dev->real_num_tx_queues != 1) {
2996 const struct net_device_ops *ops = dev->netdev_ops;
2997 if (ops->ndo_select_queue)
2998 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
2999 __netdev_pick_tx);
3000 else
3001 queue_index = __netdev_pick_tx(dev, skb);
3002
3003 if (!accel_priv)
3004 queue_index = netdev_cap_txqueue(dev, queue_index);
3005 }
3006
3007 skb_set_queue_mapping(skb, queue_index);
3008 return netdev_get_tx_queue(dev, queue_index);
3009}
3010
d29f749e 3011/**
9d08dd3d 3012 * __dev_queue_xmit - transmit a buffer
d29f749e 3013 * @skb: buffer to transmit
9d08dd3d 3014 * @accel_priv: private data used for L2 forwarding offload
d29f749e
DJ
3015 *
3016 * Queue a buffer for transmission to a network device. The caller must
3017 * have set the device and priority and built the buffer before calling
3018 * this function. The function can be called from an interrupt.
3019 *
3020 * A negative errno code is returned on a failure. A success does not
3021 * guarantee the frame will be transmitted as it may be dropped due
3022 * to congestion or traffic shaping.
3023 *
3024 * -----------------------------------------------------------------------------------
3025 * I notice this method can also return errors from the queue disciplines,
3026 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3027 * be positive.
3028 *
3029 * Regardless of the return value, the skb is consumed, so it is currently
3030 * difficult to retry a send to this method. (You can bump the ref count
3031 * before sending to hold a reference for retry if you are careful.)
3032 *
3033 * When calling this method, interrupts MUST be enabled. This is because
3034 * the BH enable code must have IRQs enabled so that it will not deadlock.
3035 * --BLG
3036 */
0a59f3a9 3037static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
1da177e4
LT
3038{
3039 struct net_device *dev = skb->dev;
dc2b4847 3040 struct netdev_queue *txq;
1da177e4
LT
3041 struct Qdisc *q;
3042 int rc = -ENOMEM;
3043
6d1ccff6
ED
3044 skb_reset_mac_header(skb);
3045
e7fd2885
WB
3046 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3047 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3048
4ec93edb
YH
3049 /* Disable soft irqs for various locks below. Also
3050 * stops preemption for RCU.
1da177e4 3051 */
4ec93edb 3052 rcu_read_lock_bh();
1da177e4 3053
5bc1421e
NH
3054 skb_update_prio(skb);
3055
02875878
ED
3056 /* If device/qdisc don't need skb->dst, release it right now while
3057 * its hot in this cpu cache.
3058 */
3059 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3060 skb_dst_drop(skb);
3061 else
3062 skb_dst_force(skb);
3063
f663dd9a 3064 txq = netdev_pick_tx(dev, skb, accel_priv);
a898def2 3065 q = rcu_dereference_bh(txq->qdisc);
37437bb2 3066
1da177e4 3067#ifdef CONFIG_NET_CLS_ACT
d1b19dff 3068 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 3069#endif
cf66ba58 3070 trace_net_dev_queue(skb);
1da177e4 3071 if (q->enqueue) {
bbd8a0d3 3072 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 3073 goto out;
1da177e4
LT
3074 }
3075
3076 /* The device has no queue. Common case for software devices:
3077 loopback, all the sorts of tunnels...
3078
932ff279
HX
3079 Really, it is unlikely that netif_tx_lock protection is necessary
3080 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
3081 counters.)
3082 However, it is possible, that they rely on protection
3083 made by us here.
3084
3085 Check this and shot the lock. It is not prone from deadlocks.
3086 Either shot noqueue qdisc, it is even simpler 8)
3087 */
3088 if (dev->flags & IFF_UP) {
3089 int cpu = smp_processor_id(); /* ok because BHs are off */
3090
c773e847 3091 if (txq->xmit_lock_owner != cpu) {
1da177e4 3092
745e20f1
ED
3093 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3094 goto recursion_alert;
3095
1f59533f
JDB
3096 skb = validate_xmit_skb(skb, dev);
3097 if (!skb)
3098 goto drop;
3099
c773e847 3100 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 3101
73466498 3102 if (!netif_xmit_stopped(txq)) {
745e20f1 3103 __this_cpu_inc(xmit_recursion);
ce93718f 3104 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
745e20f1 3105 __this_cpu_dec(xmit_recursion);
572a9d7b 3106 if (dev_xmit_complete(rc)) {
c773e847 3107 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
3108 goto out;
3109 }
3110 }
c773e847 3111 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
3112 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3113 dev->name);
1da177e4
LT
3114 } else {
3115 /* Recursion is detected! It is possible,
745e20f1
ED
3116 * unfortunately
3117 */
3118recursion_alert:
e87cc472
JP
3119 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3120 dev->name);
1da177e4
LT
3121 }
3122 }
3123
3124 rc = -ENETDOWN;
1f59533f 3125drop:
d4828d85 3126 rcu_read_unlock_bh();
1da177e4 3127
015f0688 3128 atomic_long_inc(&dev->tx_dropped);
1f59533f 3129 kfree_skb_list(skb);
1da177e4
LT
3130 return rc;
3131out:
d4828d85 3132 rcu_read_unlock_bh();
1da177e4
LT
3133 return rc;
3134}
f663dd9a 3135
7026b1dd 3136int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
f663dd9a
JW
3137{
3138 return __dev_queue_xmit(skb, NULL);
3139}
7026b1dd 3140EXPORT_SYMBOL(dev_queue_xmit_sk);
1da177e4 3141
f663dd9a
JW
3142int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3143{
3144 return __dev_queue_xmit(skb, accel_priv);
3145}
3146EXPORT_SYMBOL(dev_queue_xmit_accel);
3147
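/*
 * Illustrative sketch, not part of the original file: transmitting a
 * ready-built frame through the entry points above.  As the
 * __dev_queue_xmit() comment notes, the caller must have set skb->dev and
 * skb->priority and built the headers first, and the skb is consumed
 * whatever the outcome.  example_send_frame() is a hypothetical helper.
 */
static int __maybe_unused example_send_frame(struct net_device *dev,
					     struct sk_buff *skb)
{
	skb->dev = dev;
	skb->priority = 0;	/* or a TC_PRIO_* value meaningful to the qdisc */

	return dev_queue_xmit_sk(skb->sk, skb);
}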
3148
3149/*=======================================================================
3150 Receiver routines
3151 =======================================================================*/
3152
6b2bedc3 3153int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
3154EXPORT_SYMBOL(netdev_max_backlog);
3155
3b098e2d 3156int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
3157int netdev_budget __read_mostly = 300;
3158int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 3159
eecfd7c4
ED
3160/* Called with irq disabled */
3161static inline void ____napi_schedule(struct softnet_data *sd,
3162 struct napi_struct *napi)
3163{
3164 list_add_tail(&napi->poll_list, &sd->poll_list);
3165 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3166}
3167
bfb564e7
KK
3168#ifdef CONFIG_RPS
3169
3170/* One global table that all flow-based protocols share. */
6e3f7faf 3171struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7 3172EXPORT_SYMBOL(rps_sock_flow_table);
567e4b79
ED
3173u32 rps_cpu_mask __read_mostly;
3174EXPORT_SYMBOL(rps_cpu_mask);
bfb564e7 3175
c5905afb 3176struct static_key rps_needed __read_mostly;
adc9300e 3177
c445477d
BH
3178static struct rps_dev_flow *
3179set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3180 struct rps_dev_flow *rflow, u16 next_cpu)
3181{
a31196b0 3182 if (next_cpu < nr_cpu_ids) {
c445477d
BH
3183#ifdef CONFIG_RFS_ACCEL
3184 struct netdev_rx_queue *rxqueue;
3185 struct rps_dev_flow_table *flow_table;
3186 struct rps_dev_flow *old_rflow;
3187 u32 flow_id;
3188 u16 rxq_index;
3189 int rc;
3190
3191 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
3192 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3193 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
3194 goto out;
3195 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3196 if (rxq_index == skb_get_rx_queue(skb))
3197 goto out;
3198
3199 rxqueue = dev->_rx + rxq_index;
3200 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3201 if (!flow_table)
3202 goto out;
61b905da 3203 flow_id = skb_get_hash(skb) & flow_table->mask;
c445477d
BH
3204 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3205 rxq_index, flow_id);
3206 if (rc < 0)
3207 goto out;
3208 old_rflow = rflow;
3209 rflow = &flow_table->flows[flow_id];
c445477d
BH
3210 rflow->filter = rc;
3211 if (old_rflow->filter == rflow->filter)
3212 old_rflow->filter = RPS_NO_FILTER;
3213 out:
3214#endif
3215 rflow->last_qtail =
09994d1b 3216 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
3217 }
3218
09994d1b 3219 rflow->cpu = next_cpu;
c445477d
BH
3220 return rflow;
3221}
3222
bfb564e7
KK
3223/*
3224 * get_rps_cpu is called from netif_receive_skb and returns the target
3225 * CPU from the RPS map of the receiving queue for a given skb.
3226 * rcu_read_lock must be held on entry.
3227 */
3228static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3229 struct rps_dev_flow **rflowp)
3230{
567e4b79
ED
3231 const struct rps_sock_flow_table *sock_flow_table;
3232 struct netdev_rx_queue *rxqueue = dev->_rx;
bfb564e7 3233 struct rps_dev_flow_table *flow_table;
567e4b79 3234 struct rps_map *map;
bfb564e7 3235 int cpu = -1;
567e4b79 3236 u32 tcpu;
61b905da 3237 u32 hash;
bfb564e7
KK
3238
3239 if (skb_rx_queue_recorded(skb)) {
3240 u16 index = skb_get_rx_queue(skb);
567e4b79 3241
62fe0b40
BH
3242 if (unlikely(index >= dev->real_num_rx_queues)) {
3243 WARN_ONCE(dev->real_num_rx_queues > 1,
3244 "%s received packet on queue %u, but number "
3245 "of RX queues is %u\n",
3246 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
3247 goto done;
3248 }
567e4b79
ED
3249 rxqueue += index;
3250 }
bfb564e7 3251
567e4b79
ED
3252 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3253
3254 flow_table = rcu_dereference(rxqueue->rps_flow_table);
6e3f7faf 3255 map = rcu_dereference(rxqueue->rps_map);
567e4b79 3256 if (!flow_table && !map)
bfb564e7
KK
3257 goto done;
3258
2d47b459 3259 skb_reset_network_header(skb);
61b905da
TH
3260 hash = skb_get_hash(skb);
3261 if (!hash)
bfb564e7
KK
3262 goto done;
3263
fec5e652
TH
3264 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3265 if (flow_table && sock_flow_table) {
fec5e652 3266 struct rps_dev_flow *rflow;
567e4b79
ED
3267 u32 next_cpu;
3268 u32 ident;
3269
3270 /* First check into global flow table if there is a match */
3271 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3272 if ((ident ^ hash) & ~rps_cpu_mask)
3273 goto try_rps;
fec5e652 3274
567e4b79
ED
3275 next_cpu = ident & rps_cpu_mask;
3276
3277 /* OK, now we know there is a match,
3278 * we can look at the local (per receive queue) flow table
3279 */
61b905da 3280 rflow = &flow_table->flows[hash & flow_table->mask];
fec5e652
TH
3281 tcpu = rflow->cpu;
3282
fec5e652
TH
3283 /*
3284 * If the desired CPU (where last recvmsg was done) is
3285 * different from current CPU (one in the rx-queue flow
3286 * table entry), switch if one of the following holds:
a31196b0 3287 * - Current CPU is unset (>= nr_cpu_ids).
fec5e652
TH
3288 * - Current CPU is offline.
3289 * - The current CPU's queue tail has advanced beyond the
3290 * last packet that was enqueued using this table entry.
3291 * This guarantees that all previous packets for the flow
3292 * have been dequeued, thus preserving in order delivery.
3293 */
3294 if (unlikely(tcpu != next_cpu) &&
a31196b0 3295 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
fec5e652 3296 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3297 rflow->last_qtail)) >= 0)) {
3298 tcpu = next_cpu;
c445477d 3299 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3300 }
c445477d 3301
a31196b0 3302 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
fec5e652
TH
3303 *rflowp = rflow;
3304 cpu = tcpu;
3305 goto done;
3306 }
3307 }
3308
567e4b79
ED
3309try_rps:
3310
0a9627f2 3311 if (map) {
8fc54f68 3312 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
0a9627f2
TH
3313 if (cpu_online(tcpu)) {
3314 cpu = tcpu;
3315 goto done;
3316 }
3317 }
3318
3319done:
0a9627f2
TH
3320 return cpu;
3321}
3322
c445477d
BH
3323#ifdef CONFIG_RFS_ACCEL
3324
3325/**
3326 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3327 * @dev: Device on which the filter was set
3328 * @rxq_index: RX queue index
3329 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3330 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3331 *
3332 * Drivers that implement ndo_rx_flow_steer() should periodically call
3333 * this function for each installed filter and remove the filters for
3334 * which it returns %true.
3335 */
3336bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3337 u32 flow_id, u16 filter_id)
3338{
3339 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3340 struct rps_dev_flow_table *flow_table;
3341 struct rps_dev_flow *rflow;
3342 bool expire = true;
a31196b0 3343 unsigned int cpu;
c445477d
BH
3344
3345 rcu_read_lock();
3346 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3347 if (flow_table && flow_id <= flow_table->mask) {
3348 rflow = &flow_table->flows[flow_id];
3349 cpu = ACCESS_ONCE(rflow->cpu);
a31196b0 3350 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
c445477d
BH
3351 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3352 rflow->last_qtail) <
3353 (int)(10 * flow_table->mask)))
3354 expire = false;
3355 }
3356 rcu_read_unlock();
3357 return expire;
3358}
3359EXPORT_SYMBOL(rps_may_expire_flow);
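/*
 * Illustrative sketch, not part of the original file: the periodic expiry
 * scan the rps_may_expire_flow() kernel-doc above asks ndo_rx_flow_steer()
 * implementations to run.  struct example_rfs_filter, the filter table and
 * example_expire_filters() are hypothetical stand-ins for a driver's own
 * bookkeeping.
 */
struct example_rfs_filter {
	bool in_use;
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
};

static void __maybe_unused example_expire_filters(struct net_device *dev,
						  struct example_rfs_filter *tbl,
						  unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* remove the corresponding hardware steering rule here */
			tbl[i].in_use = false;
		}
	}
}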
3360
3361#endif /* CONFIG_RFS_ACCEL */
3362
0a9627f2 3363/* Called from hardirq (IPI) context */
e36fa2f7 3364static void rps_trigger_softirq(void *data)
0a9627f2 3365{
e36fa2f7
ED
3366 struct softnet_data *sd = data;
3367
eecfd7c4 3368 ____napi_schedule(sd, &sd->backlog);
dee42870 3369 sd->received_rps++;
0a9627f2 3370}
e36fa2f7 3371
fec5e652 3372#endif /* CONFIG_RPS */
0a9627f2 3373
e36fa2f7
ED
3374/*
3375 * Check if this softnet_data structure is another cpu one
3376 * If yes, queue it to our IPI list and return 1
3377 * If no, return 0
3378 */
3379static int rps_ipi_queued(struct softnet_data *sd)
3380{
3381#ifdef CONFIG_RPS
903ceff7 3382 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
e36fa2f7
ED
3383
3384 if (sd != mysd) {
3385 sd->rps_ipi_next = mysd->rps_ipi_list;
3386 mysd->rps_ipi_list = sd;
3387
3388 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3389 return 1;
3390 }
3391#endif /* CONFIG_RPS */
3392 return 0;
3393}
3394
99bbc707
WB
3395#ifdef CONFIG_NET_FLOW_LIMIT
3396int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3397#endif
3398
3399static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3400{
3401#ifdef CONFIG_NET_FLOW_LIMIT
3402 struct sd_flow_limit *fl;
3403 struct softnet_data *sd;
3404 unsigned int old_flow, new_flow;
3405
3406 if (qlen < (netdev_max_backlog >> 1))
3407 return false;
3408
903ceff7 3409 sd = this_cpu_ptr(&softnet_data);
99bbc707
WB
3410
3411 rcu_read_lock();
3412 fl = rcu_dereference(sd->flow_limit);
3413 if (fl) {
3958afa1 3414 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
99bbc707
WB
3415 old_flow = fl->history[fl->history_head];
3416 fl->history[fl->history_head] = new_flow;
3417
3418 fl->history_head++;
3419 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3420
3421 if (likely(fl->buckets[old_flow]))
3422 fl->buckets[old_flow]--;
3423
3424 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3425 fl->count++;
3426 rcu_read_unlock();
3427 return true;
3428 }
3429 }
3430 rcu_read_unlock();
3431#endif
3432 return false;
3433}
3434
0a9627f2
TH
3435/*
3436 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3437 * queue (may be a remote CPU queue).
3438 */
fec5e652
TH
3439static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3440 unsigned int *qtail)
0a9627f2 3441{
e36fa2f7 3442 struct softnet_data *sd;
0a9627f2 3443 unsigned long flags;
99bbc707 3444 unsigned int qlen;
0a9627f2 3445
e36fa2f7 3446 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3447
3448 local_irq_save(flags);
0a9627f2 3449
e36fa2f7 3450 rps_lock(sd);
e9e4dd32
JA
3451 if (!netif_running(skb->dev))
3452 goto drop;
99bbc707
WB
3453 qlen = skb_queue_len(&sd->input_pkt_queue);
3454 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
e008f3f0 3455 if (qlen) {
0a9627f2 3456enqueue:
e36fa2f7 3457 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3458 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3459 rps_unlock(sd);
152102c7 3460 local_irq_restore(flags);
0a9627f2
TH
3461 return NET_RX_SUCCESS;
3462 }
3463
ebda37c2
ED
3464 /* Schedule NAPI for backlog device
3465 * We can use non atomic operation since we own the queue lock
3466 */
3467 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3468 if (!rps_ipi_queued(sd))
eecfd7c4 3469 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3470 }
3471 goto enqueue;
3472 }
3473
e9e4dd32 3474drop:
dee42870 3475 sd->dropped++;
e36fa2f7 3476 rps_unlock(sd);
0a9627f2 3477
0a9627f2
TH
3478 local_irq_restore(flags);
3479
caf586e5 3480 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3481 kfree_skb(skb);
3482 return NET_RX_DROP;
3483}
1da177e4 3484
ae78dbfa 3485static int netif_rx_internal(struct sk_buff *skb)
1da177e4 3486{
b0e28f1e 3487 int ret;
1da177e4 3488
588f0330 3489 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3490
cf66ba58 3491 trace_netif_rx(skb);
df334545 3492#ifdef CONFIG_RPS
c5905afb 3493 if (static_key_false(&rps_needed)) {
fec5e652 3494 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3495 int cpu;
3496
cece1945 3497 preempt_disable();
b0e28f1e 3498 rcu_read_lock();
fec5e652
TH
3499
3500 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3501 if (cpu < 0)
3502 cpu = smp_processor_id();
fec5e652
TH
3503
3504 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3505
b0e28f1e 3506 rcu_read_unlock();
cece1945 3507 preempt_enable();
adc9300e
ED
3508 } else
3509#endif
fec5e652
TH
3510 {
3511 unsigned int qtail;
3512 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3513 put_cpu();
3514 }
b0e28f1e 3515 return ret;
1da177e4 3516}
ae78dbfa
BH
3517
3518/**
3519 * netif_rx - post buffer to the network code
3520 * @skb: buffer to post
3521 *
3522 * This function receives a packet from a device driver and queues it for
3523 * the upper (protocol) levels to process. It always succeeds. The buffer
3524 * may be dropped during processing for congestion control or by the
3525 * protocol layers.
3526 *
3527 * return values:
3528 * NET_RX_SUCCESS (no congestion)
3529 * NET_RX_DROP (packet was dropped)
3530 *
3531 */
3532
3533int netif_rx(struct sk_buff *skb)
3534{
3535 trace_netif_rx_entry(skb);
3536
3537 return netif_rx_internal(skb);
3538}
d1b19dff 3539EXPORT_SYMBOL(netif_rx);
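
/* Illustrative sketch (not part of dev.c): how a simple non-NAPI driver
 * might hand a received frame to the stack with netif_rx(). The buffer
 * handling and the helper name are hypothetical; only
 * netdev_alloc_skb_ip_align(), skb_put(), eth_type_trans() and netif_rx()
 * are the real API (declared in <linux/netdevice.h> / <linux/etherdevice.h>).
 */
static void example_rx_one_frame(struct net_device *dev, void *data, int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);		/* copy frame out of the DMA buffer */
	skb->protocol = eth_type_trans(skb, dev);	/* sets skb->dev and pkt_type */

	netif_rx(skb);					/* enqueue to the per-CPU backlog */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}
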
1da177e4
LT
3540
3541int netif_rx_ni(struct sk_buff *skb)
3542{
3543 int err;
3544
ae78dbfa
BH
3545 trace_netif_rx_ni_entry(skb);
3546
1da177e4 3547 preempt_disable();
ae78dbfa 3548 err = netif_rx_internal(skb);
1da177e4
LT
3549 if (local_softirq_pending())
3550 do_softirq();
3551 preempt_enable();
3552
3553 return err;
3554}
1da177e4
LT
3555EXPORT_SYMBOL(netif_rx_ni);
3556
1da177e4
LT
3557static void net_tx_action(struct softirq_action *h)
3558{
903ceff7 3559 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
1da177e4
LT
3560
3561 if (sd->completion_queue) {
3562 struct sk_buff *clist;
3563
3564 local_irq_disable();
3565 clist = sd->completion_queue;
3566 sd->completion_queue = NULL;
3567 local_irq_enable();
3568
3569 while (clist) {
3570 struct sk_buff *skb = clist;
3571 clist = clist->next;
3572
547b792c 3573 WARN_ON(atomic_read(&skb->users));
e6247027
ED
3574 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3575 trace_consume_skb(skb);
3576 else
3577 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3578 __kfree_skb(skb);
3579 }
3580 }
3581
3582 if (sd->output_queue) {
37437bb2 3583 struct Qdisc *head;
1da177e4
LT
3584
3585 local_irq_disable();
3586 head = sd->output_queue;
3587 sd->output_queue = NULL;
a9cbd588 3588 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3589 local_irq_enable();
3590
3591 while (head) {
37437bb2
DM
3592 struct Qdisc *q = head;
3593 spinlock_t *root_lock;
3594
1da177e4
LT
3595 head = head->next_sched;
3596
5fb66229 3597 root_lock = qdisc_lock(q);
37437bb2 3598 if (spin_trylock(root_lock)) {
4e857c58 3599 smp_mb__before_atomic();
def82a1d
JP
3600 clear_bit(__QDISC_STATE_SCHED,
3601 &q->state);
37437bb2
DM
3602 qdisc_run(q);
3603 spin_unlock(root_lock);
1da177e4 3604 } else {
195648bb 3605 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3606 &q->state)) {
195648bb 3607 __netif_reschedule(q);
e8a83e10 3608 } else {
4e857c58 3609 smp_mb__before_atomic();
e8a83e10
JP
3610 clear_bit(__QDISC_STATE_SCHED,
3611 &q->state);
3612 }
1da177e4
LT
3613 }
3614 }
3615 }
3616}
3617
ab95bfe0
JP
3618#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3619 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3620/* This hook is defined here for ATM LANE */
3621int (*br_fdb_test_addr_hook)(struct net_device *dev,
3622 unsigned char *addr) __read_mostly;
4fb019a0 3623EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3624#endif
1da177e4 3625
f697c3e8
HX
3626static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3627 struct packet_type **pt_prev,
3628 int *ret, struct net_device *orig_dev)
3629{
e7582bab 3630#ifdef CONFIG_NET_CLS_ACT
d2788d34
DB
3631 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3632 struct tcf_result cl_res;
24824a09 3633
c9e99fd0
DB
3634 /* If there's at least one ingress present somewhere (so
3635 * we get here via enabled static key), remaining devices
3636 * that are not configured with an ingress qdisc will bail
d2788d34 3637 * out here.
c9e99fd0 3638 */
d2788d34 3639 if (!cl)
4577139b 3640 return skb;
f697c3e8
HX
3641 if (*pt_prev) {
3642 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3643 *pt_prev = NULL;
1da177e4
LT
3644 }
3645
3365495c 3646 qdisc_skb_cb(skb)->pkt_len = skb->len;
c9e99fd0 3647 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
24ea591d 3648 qdisc_bstats_cpu_update(cl->q, skb);
c9e99fd0 3649
d2788d34
DB
3650 switch (tc_classify(skb, cl, &cl_res)) {
3651 case TC_ACT_OK:
3652 case TC_ACT_RECLASSIFY:
3653 skb->tc_index = TC_H_MIN(cl_res.classid);
3654 break;
3655 case TC_ACT_SHOT:
24ea591d 3656 qdisc_qstats_cpu_drop(cl->q);
d2788d34
DB
3657 case TC_ACT_STOLEN:
3658 case TC_ACT_QUEUED:
3659 kfree_skb(skb);
3660 return NULL;
3661 default:
3662 break;
f697c3e8 3663 }
e7582bab 3664#endif /* CONFIG_NET_CLS_ACT */
e687ad60
PN
3665 return skb;
3666}
1da177e4 3667
ab95bfe0
JP
3668/**
3669 * netdev_rx_handler_register - register receive handler
3670 * @dev: device to register a handler for
3671 * @rx_handler: receive handler to register
93e2c32b 3672 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0 3673 *
e227867f 3674 * Register a receive handler for a device. This handler will then be
ab95bfe0
JP
3675 * called from __netif_receive_skb. A negative errno code is returned
3676 * on a failure.
3677 *
3678 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3679 *
3680 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3681 */
3682int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3683 rx_handler_func_t *rx_handler,
3684 void *rx_handler_data)
ab95bfe0
JP
3685{
3686 ASSERT_RTNL();
3687
3688 if (dev->rx_handler)
3689 return -EBUSY;
3690
00cfec37 3691 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3692 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3693 rcu_assign_pointer(dev->rx_handler, rx_handler);
3694
3695 return 0;
3696}
3697EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
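
/* Illustrative sketch (not part of dev.c): how a stacking driver
 * (bridge/bonding/team/macvlan-style) might attach itself to a lower
 * device with netdev_rx_handler_register(). "example_port" and the
 * handler body are hypothetical; the return codes and the RTNL
 * requirement match the API documented above.
 */
struct example_port {
	struct net_device *lower_dev;
	struct net_device *upper_dev;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	/* Steer the frame to the aggregating upper device and let
	 * __netif_receive_skb_core() run another round on it.
	 */
	skb->dev = port->upper_dev;
	return RX_HANDLER_ANOTHER;
}

static int example_attach_port(struct example_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port->lower_dev,
					 example_handle_frame, port);
	rtnl_unlock();
	return err;	/* -EBUSY if another handler is already attached */
}
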
3698
3699/**
3700 * netdev_rx_handler_unregister - unregister receive handler
3701 * @dev: device to unregister a handler from
3702 *
166ec369 3703 * Unregister a receive handler from a device.
ab95bfe0
JP
3704 *
3705 * The caller must hold the rtnl_mutex.
3706 */
3707void netdev_rx_handler_unregister(struct net_device *dev)
3708{
3709
3710 ASSERT_RTNL();
a9b3cd7f 3711 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
3712 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3713 * section has a guarantee to see a non NULL rx_handler_data
3714 * as well.
3715 */
3716 synchronize_net();
a9b3cd7f 3717 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3718}
3719EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3720
b4b9e355
MG
3721/*
3722 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3723 * the special handling of PFMEMALLOC skbs.
3724 */
3725static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3726{
3727 switch (skb->protocol) {
2b8837ae
JP
3728 case htons(ETH_P_ARP):
3729 case htons(ETH_P_IP):
3730 case htons(ETH_P_IPV6):
3731 case htons(ETH_P_8021Q):
3732 case htons(ETH_P_8021AD):
b4b9e355
MG
3733 return true;
3734 default:
3735 return false;
3736 }
3737}
3738
e687ad60
PN
3739static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3740 int *ret, struct net_device *orig_dev)
3741{
e7582bab 3742#ifdef CONFIG_NETFILTER_INGRESS
e687ad60
PN
3743 if (nf_hook_ingress_active(skb)) {
3744 if (*pt_prev) {
3745 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3746 *pt_prev = NULL;
3747 }
3748
3749 return nf_hook_ingress(skb);
3750 }
e7582bab 3751#endif /* CONFIG_NETFILTER_INGRESS */
e687ad60
PN
3752 return 0;
3753}
e687ad60 3754
9754e293 3755static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3756{
3757 struct packet_type *ptype, *pt_prev;
ab95bfe0 3758 rx_handler_func_t *rx_handler;
f2ccd8fa 3759 struct net_device *orig_dev;
8a4eb573 3760 bool deliver_exact = false;
1da177e4 3761 int ret = NET_RX_DROP;
252e3346 3762 __be16 type;
1da177e4 3763
588f0330 3764 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3765
cf66ba58 3766 trace_netif_receive_skb(skb);
9b22ea56 3767
cc9bd5ce 3768 orig_dev = skb->dev;
8f903c70 3769
c1d2bbe1 3770 skb_reset_network_header(skb);
fda55eca
ED
3771 if (!skb_transport_header_was_set(skb))
3772 skb_reset_transport_header(skb);
0b5c9db1 3773 skb_reset_mac_len(skb);
1da177e4
LT
3774
3775 pt_prev = NULL;
3776
63d8ea7f 3777another_round:
b6858177 3778 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3779
3780 __this_cpu_inc(softnet_data.processed);
3781
8ad227ff
PM
3782 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3783 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
0d5501c1 3784 skb = skb_vlan_untag(skb);
bcc6d479 3785 if (unlikely(!skb))
2c17d27c 3786 goto out;
bcc6d479
JP
3787 }
3788
1da177e4
LT
3789#ifdef CONFIG_NET_CLS_ACT
3790 if (skb->tc_verd & TC_NCLS) {
3791 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3792 goto ncls;
3793 }
3794#endif
3795
9754e293 3796 if (pfmemalloc)
b4b9e355
MG
3797 goto skip_taps;
3798
1da177e4 3799 list_for_each_entry_rcu(ptype, &ptype_all, list) {
7866a621
SN
3800 if (pt_prev)
3801 ret = deliver_skb(skb, pt_prev, orig_dev);
3802 pt_prev = ptype;
3803 }
3804
3805 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3806 if (pt_prev)
3807 ret = deliver_skb(skb, pt_prev, orig_dev);
3808 pt_prev = ptype;
1da177e4
LT
3809 }
3810
b4b9e355 3811skip_taps:
1cf51900 3812#ifdef CONFIG_NET_INGRESS
4577139b
DB
3813 if (static_key_false(&ingress_needed)) {
3814 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3815 if (!skb)
2c17d27c 3816 goto out;
e687ad60
PN
3817
3818 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
2c17d27c 3819 goto out;
4577139b 3820 }
1cf51900
PN
3821#endif
3822#ifdef CONFIG_NET_CLS_ACT
4577139b 3823 skb->tc_verd = 0;
1da177e4
LT
3824ncls:
3825#endif
9754e293 3826 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3827 goto drop;
3828
df8a39de 3829 if (skb_vlan_tag_present(skb)) {
2425717b
JF
3830 if (pt_prev) {
3831 ret = deliver_skb(skb, pt_prev, orig_dev);
3832 pt_prev = NULL;
3833 }
48cc32d3 3834 if (vlan_do_receive(&skb))
2425717b
JF
3835 goto another_round;
3836 else if (unlikely(!skb))
2c17d27c 3837 goto out;
2425717b
JF
3838 }
3839
48cc32d3 3840 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3841 if (rx_handler) {
3842 if (pt_prev) {
3843 ret = deliver_skb(skb, pt_prev, orig_dev);
3844 pt_prev = NULL;
3845 }
8a4eb573
JP
3846 switch (rx_handler(&skb)) {
3847 case RX_HANDLER_CONSUMED:
3bc1b1ad 3848 ret = NET_RX_SUCCESS;
2c17d27c 3849 goto out;
8a4eb573 3850 case RX_HANDLER_ANOTHER:
63d8ea7f 3851 goto another_round;
8a4eb573
JP
3852 case RX_HANDLER_EXACT:
3853 deliver_exact = true;
3854 case RX_HANDLER_PASS:
3855 break;
3856 default:
3857 BUG();
3858 }
ab95bfe0 3859 }
1da177e4 3860
df8a39de
JP
3861 if (unlikely(skb_vlan_tag_present(skb))) {
3862 if (skb_vlan_tag_get_id(skb))
d4b812de
ED
3863 skb->pkt_type = PACKET_OTHERHOST;
3864 /* Note: we might in the future use prio bits
3865 * and set skb->priority like in vlan_do_receive()
3866 * For the time being, just ignore Priority Code Point
3867 */
3868 skb->vlan_tci = 0;
3869 }
48cc32d3 3870
7866a621
SN
3871 type = skb->protocol;
3872
63d8ea7f 3873 /* deliver only exact match when indicated */
7866a621
SN
3874 if (likely(!deliver_exact)) {
3875 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3876 &ptype_base[ntohs(type) &
3877 PTYPE_HASH_MASK]);
3878 }
1f3c8804 3879
7866a621
SN
3880 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3881 &orig_dev->ptype_specific);
3882
3883 if (unlikely(skb->dev != orig_dev)) {
3884 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3885 &skb->dev->ptype_specific);
1da177e4
LT
3886 }
3887
3888 if (pt_prev) {
1080e512 3889 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3890 goto drop;
1080e512
MT
3891 else
3892 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3893 } else {
b4b9e355 3894drop:
caf586e5 3895 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3896 kfree_skb(skb);
3897 /* Jamal, now you will not be able to escape explaining
3898 * to me how you were going to use this. :-)
3899 */
3900 ret = NET_RX_DROP;
3901 }
3902
2c17d27c 3903out:
9754e293
DM
3904 return ret;
3905}
3906
3907static int __netif_receive_skb(struct sk_buff *skb)
3908{
3909 int ret;
3910
3911 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3912 unsigned long pflags = current->flags;
3913
3914 /*
3915 * PFMEMALLOC skbs are special, they should
3916 * - be delivered to SOCK_MEMALLOC sockets only
3917 * - stay away from userspace
3918 * - have bounded memory usage
3919 *
3920 * Use PF_MEMALLOC as this saves us from propagating the allocation
3921 * context down to all allocation sites.
3922 */
3923 current->flags |= PF_MEMALLOC;
3924 ret = __netif_receive_skb_core(skb, true);
3925 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3926 } else
3927 ret = __netif_receive_skb_core(skb, false);
3928
1da177e4
LT
3929 return ret;
3930}
0a9627f2 3931
ae78dbfa 3932static int netif_receive_skb_internal(struct sk_buff *skb)
0a9627f2 3933{
2c17d27c
JA
3934 int ret;
3935
588f0330 3936 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3937
c1f19b51
RC
3938 if (skb_defer_rx_timestamp(skb))
3939 return NET_RX_SUCCESS;
3940
2c17d27c
JA
3941 rcu_read_lock();
3942
df334545 3943#ifdef CONFIG_RPS
c5905afb 3944 if (static_key_false(&rps_needed)) {
3b098e2d 3945 struct rps_dev_flow voidflow, *rflow = &voidflow;
2c17d27c 3946 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3947
3b098e2d
ED
3948 if (cpu >= 0) {
3949 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3950 rcu_read_unlock();
adc9300e 3951 return ret;
3b098e2d 3952 }
fec5e652 3953 }
1e94d72f 3954#endif
2c17d27c
JA
3955 ret = __netif_receive_skb(skb);
3956 rcu_read_unlock();
3957 return ret;
0a9627f2 3958}
ae78dbfa
BH
3959
3960/**
3961 * netif_receive_skb - process receive buffer from network
3962 * @skb: buffer to process
3963 *
3964 * netif_receive_skb() is the main receive data processing function.
3965 * It always succeeds. The buffer may be dropped during processing
3966 * for congestion control or by the protocol layers.
3967 *
3968 * This function may only be called from softirq context and interrupts
3969 * should be enabled.
3970 *
3971 * Return values (usually ignored):
3972 * NET_RX_SUCCESS: no congestion
3973 * NET_RX_DROP: packet was dropped
3974 */
7026b1dd 3975int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
ae78dbfa
BH
3976{
3977 trace_netif_receive_skb_entry(skb);
3978
3979 return netif_receive_skb_internal(skb);
3980}
7026b1dd 3981EXPORT_SYMBOL(netif_receive_skb_sk);
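
/* Illustrative sketch (not part of dev.c): unlike netif_rx(), which only
 * queues to the per-CPU backlog, this path processes the frame
 * synchronously and must be called from softirq (NAPI poll) context.
 * Drivers normally go through the netif_receive_skb() wrapper, assumed
 * here to resolve to netif_receive_skb_sk(); the helper name below is
 * hypothetical.
 */
static void example_deliver_in_poll(struct napi_struct *napi,
				    struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);

	/* Runs taps, ingress hooks, the rx_handler and protocol delivery
	 * before returning; no backlog queueing unless RPS steers the skb
	 * to another CPU.
	 */
	netif_receive_skb(skb);
}
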
1da177e4 3982
88751275
ED
3983/* Network device is going away, flush any packets still pending
3984 * Called with irqs disabled.
3985 */
152102c7 3986static void flush_backlog(void *arg)
6e583ce5 3987{
152102c7 3988 struct net_device *dev = arg;
903ceff7 3989 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6e583ce5
SH
3990 struct sk_buff *skb, *tmp;
3991
e36fa2f7 3992 rps_lock(sd);
6e7676c1 3993 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3994 if (skb->dev == dev) {
e36fa2f7 3995 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3996 kfree_skb(skb);
76cc8b13 3997 input_queue_head_incr(sd);
6e583ce5 3998 }
6e7676c1 3999 }
e36fa2f7 4000 rps_unlock(sd);
6e7676c1
CG
4001
4002 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4003 if (skb->dev == dev) {
4004 __skb_unlink(skb, &sd->process_queue);
4005 kfree_skb(skb);
76cc8b13 4006 input_queue_head_incr(sd);
6e7676c1
CG
4007 }
4008 }
6e583ce5
SH
4009}
4010
d565b0a1
HX
4011static int napi_gro_complete(struct sk_buff *skb)
4012{
22061d80 4013 struct packet_offload *ptype;
d565b0a1 4014 __be16 type = skb->protocol;
22061d80 4015 struct list_head *head = &offload_base;
d565b0a1
HX
4016 int err = -ENOENT;
4017
c3c7c254
ED
4018 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4019
fc59f9a3
HX
4020 if (NAPI_GRO_CB(skb)->count == 1) {
4021 skb_shinfo(skb)->gso_size = 0;
d565b0a1 4022 goto out;
fc59f9a3 4023 }
d565b0a1
HX
4024
4025 rcu_read_lock();
4026 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4027 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
4028 continue;
4029
299603e8 4030 err = ptype->callbacks.gro_complete(skb, 0);
d565b0a1
HX
4031 break;
4032 }
4033 rcu_read_unlock();
4034
4035 if (err) {
4036 WARN_ON(&ptype->list == head);
4037 kfree_skb(skb);
4038 return NET_RX_SUCCESS;
4039 }
4040
4041out:
ae78dbfa 4042 return netif_receive_skb_internal(skb);
d565b0a1
HX
4043}
4044
2e71a6f8
ED
4045/* napi->gro_list contains packets ordered by age.
4046 * The youngest packets are at the head of it.
4047 * Complete skbs in reverse order to reduce latencies.
4048 */
4049void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 4050{
2e71a6f8 4051 struct sk_buff *skb, *prev = NULL;
d565b0a1 4052
2e71a6f8
ED
4053 /* scan list and build reverse chain */
4054 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4055 skb->prev = prev;
4056 prev = skb;
4057 }
4058
4059 for (skb = prev; skb; skb = prev) {
d565b0a1 4060 skb->next = NULL;
2e71a6f8
ED
4061
4062 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4063 return;
4064
4065 prev = skb->prev;
d565b0a1 4066 napi_gro_complete(skb);
2e71a6f8 4067 napi->gro_count--;
d565b0a1
HX
4068 }
4069
4070 napi->gro_list = NULL;
4071}
86cac58b 4072EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 4073
89c5fa33
ED
4074static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4075{
4076 struct sk_buff *p;
4077 unsigned int maclen = skb->dev->hard_header_len;
0b4cec8c 4078 u32 hash = skb_get_hash_raw(skb);
89c5fa33
ED
4079
4080 for (p = napi->gro_list; p; p = p->next) {
4081 unsigned long diffs;
4082
0b4cec8c
TH
4083 NAPI_GRO_CB(p)->flush = 0;
4084
4085 if (hash != skb_get_hash_raw(p)) {
4086 NAPI_GRO_CB(p)->same_flow = 0;
4087 continue;
4088 }
4089
89c5fa33
ED
4090 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4091 diffs |= p->vlan_tci ^ skb->vlan_tci;
4092 if (maclen == ETH_HLEN)
4093 diffs |= compare_ether_header(skb_mac_header(p),
a50e233c 4094 skb_mac_header(skb));
89c5fa33
ED
4095 else if (!diffs)
4096 diffs = memcmp(skb_mac_header(p),
a50e233c 4097 skb_mac_header(skb),
89c5fa33
ED
4098 maclen);
4099 NAPI_GRO_CB(p)->same_flow = !diffs;
89c5fa33
ED
4100 }
4101}
4102
299603e8
JC
4103static void skb_gro_reset_offset(struct sk_buff *skb)
4104{
4105 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4106 const skb_frag_t *frag0 = &pinfo->frags[0];
4107
4108 NAPI_GRO_CB(skb)->data_offset = 0;
4109 NAPI_GRO_CB(skb)->frag0 = NULL;
4110 NAPI_GRO_CB(skb)->frag0_len = 0;
4111
4112 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4113 pinfo->nr_frags &&
4114 !PageHighMem(skb_frag_page(frag0))) {
4115 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4116 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
89c5fa33
ED
4117 }
4118}
4119
a50e233c
ED
4120static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4121{
4122 struct skb_shared_info *pinfo = skb_shinfo(skb);
4123
4124 BUG_ON(skb->end - skb->tail < grow);
4125
4126 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4127
4128 skb->data_len -= grow;
4129 skb->tail += grow;
4130
4131 pinfo->frags[0].page_offset += grow;
4132 skb_frag_size_sub(&pinfo->frags[0], grow);
4133
4134 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4135 skb_frag_unref(skb, 0);
4136 memmove(pinfo->frags, pinfo->frags + 1,
4137 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4138 }
4139}
4140
bb728820 4141static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
4142{
4143 struct sk_buff **pp = NULL;
22061d80 4144 struct packet_offload *ptype;
d565b0a1 4145 __be16 type = skb->protocol;
22061d80 4146 struct list_head *head = &offload_base;
0da2afd5 4147 int same_flow;
5b252f0c 4148 enum gro_result ret;
a50e233c 4149 int grow;
d565b0a1 4150
9c62a68d 4151 if (!(skb->dev->features & NETIF_F_GRO))
d565b0a1
HX
4152 goto normal;
4153
5a212329 4154 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
f17f5c91
HX
4155 goto normal;
4156
89c5fa33
ED
4157 gro_list_prepare(napi, skb);
4158
d565b0a1
HX
4159 rcu_read_lock();
4160 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4161 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
4162 continue;
4163
86911732 4164 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 4165 skb_reset_mac_len(skb);
d565b0a1
HX
4166 NAPI_GRO_CB(skb)->same_flow = 0;
4167 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 4168 NAPI_GRO_CB(skb)->free = 0;
b582ef09 4169 NAPI_GRO_CB(skb)->udp_mark = 0;
15e2396d 4170 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
d565b0a1 4171
662880f4
TH
4172 /* Setup for GRO checksum validation */
4173 switch (skb->ip_summed) {
4174 case CHECKSUM_COMPLETE:
4175 NAPI_GRO_CB(skb)->csum = skb->csum;
4176 NAPI_GRO_CB(skb)->csum_valid = 1;
4177 NAPI_GRO_CB(skb)->csum_cnt = 0;
4178 break;
4179 case CHECKSUM_UNNECESSARY:
4180 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4181 NAPI_GRO_CB(skb)->csum_valid = 0;
4182 break;
4183 default:
4184 NAPI_GRO_CB(skb)->csum_cnt = 0;
4185 NAPI_GRO_CB(skb)->csum_valid = 0;
4186 }
d565b0a1 4187
f191a1d1 4188 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
4189 break;
4190 }
4191 rcu_read_unlock();
4192
4193 if (&ptype->list == head)
4194 goto normal;
4195
0da2afd5 4196 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 4197 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 4198
d565b0a1
HX
4199 if (pp) {
4200 struct sk_buff *nskb = *pp;
4201
4202 *pp = nskb->next;
4203 nskb->next = NULL;
4204 napi_gro_complete(nskb);
4ae5544f 4205 napi->gro_count--;
d565b0a1
HX
4206 }
4207
0da2afd5 4208 if (same_flow)
d565b0a1
HX
4209 goto ok;
4210
600adc18 4211 if (NAPI_GRO_CB(skb)->flush)
d565b0a1 4212 goto normal;
d565b0a1 4213
600adc18
ED
4214 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4215 struct sk_buff *nskb = napi->gro_list;
4216
4217 /* locate the end of the list to select the 'oldest' flow */
4218 while (nskb->next) {
4219 pp = &nskb->next;
4220 nskb = *pp;
4221 }
4222 *pp = NULL;
4223 nskb->next = NULL;
4224 napi_gro_complete(nskb);
4225 } else {
4226 napi->gro_count++;
4227 }
d565b0a1 4228 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 4229 NAPI_GRO_CB(skb)->age = jiffies;
29e98242 4230 NAPI_GRO_CB(skb)->last = skb;
86911732 4231 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
4232 skb->next = napi->gro_list;
4233 napi->gro_list = skb;
5d0d9be8 4234 ret = GRO_HELD;
d565b0a1 4235
ad0f9904 4236pull:
a50e233c
ED
4237 grow = skb_gro_offset(skb) - skb_headlen(skb);
4238 if (grow > 0)
4239 gro_pull_from_frag0(skb, grow);
d565b0a1 4240ok:
5d0d9be8 4241 return ret;
d565b0a1
HX
4242
4243normal:
ad0f9904
HX
4244 ret = GRO_NORMAL;
4245 goto pull;
5d38a079 4246}
96e93eab 4247
bf5a755f
JC
4248struct packet_offload *gro_find_receive_by_type(__be16 type)
4249{
4250 struct list_head *offload_head = &offload_base;
4251 struct packet_offload *ptype;
4252
4253 list_for_each_entry_rcu(ptype, offload_head, list) {
4254 if (ptype->type != type || !ptype->callbacks.gro_receive)
4255 continue;
4256 return ptype;
4257 }
4258 return NULL;
4259}
e27a2f83 4260EXPORT_SYMBOL(gro_find_receive_by_type);
bf5a755f
JC
4261
4262struct packet_offload *gro_find_complete_by_type(__be16 type)
4263{
4264 struct list_head *offload_head = &offload_base;
4265 struct packet_offload *ptype;
4266
4267 list_for_each_entry_rcu(ptype, offload_head, list) {
4268 if (ptype->type != type || !ptype->callbacks.gro_complete)
4269 continue;
4270 return ptype;
4271 }
4272 return NULL;
4273}
e27a2f83 4274EXPORT_SYMBOL(gro_find_complete_by_type);
5d38a079 4275
bb728820 4276static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 4277{
5d0d9be8
HX
4278 switch (ret) {
4279 case GRO_NORMAL:
ae78dbfa 4280 if (netif_receive_skb_internal(skb))
c7c4b3b6
BH
4281 ret = GRO_DROP;
4282 break;
5d38a079 4283
5d0d9be8 4284 case GRO_DROP:
5d38a079
HX
4285 kfree_skb(skb);
4286 break;
5b252f0c 4287
daa86548 4288 case GRO_MERGED_FREE:
d7e8883c
ED
4289 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4290 kmem_cache_free(skbuff_head_cache, skb);
4291 else
4292 __kfree_skb(skb);
daa86548
ED
4293 break;
4294
5b252f0c
BH
4295 case GRO_HELD:
4296 case GRO_MERGED:
4297 break;
5d38a079
HX
4298 }
4299
c7c4b3b6 4300 return ret;
5d0d9be8 4301}
5d0d9be8 4302
c7c4b3b6 4303gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 4304{
ae78dbfa 4305 trace_napi_gro_receive_entry(skb);
86911732 4306
a50e233c
ED
4307 skb_gro_reset_offset(skb);
4308
89c5fa33 4309 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
4310}
4311EXPORT_SYMBOL(napi_gro_receive);
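
/* Illustrative sketch (not part of dev.c): per-frame GRO submission from a
 * NAPI driver. The driver is expected to call eth_type_trans() first so
 * skb->protocol is set before dev_gro_receive() looks up the packet_offload
 * by type. The helper name is hypothetical.
 */
static void example_gro_feed_one(struct napi_struct *napi, struct sk_buff *skb)
{
	/* GRO needs skb->protocol to pick the right packet_offload */
	skb->protocol = eth_type_trans(skb, napi->dev);

	/* After this call the skb is owned by the stack: it was either
	 * delivered (GRO_NORMAL), merged/held on napi->gro_list, or freed.
	 */
	napi_gro_receive(napi, skb);
}
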
4312
d0c2b0d2 4313static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 4314{
93a35f59
ED
4315 if (unlikely(skb->pfmemalloc)) {
4316 consume_skb(skb);
4317 return;
4318 }
96e93eab 4319 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
4320 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4321 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 4322 skb->vlan_tci = 0;
66c46d74 4323 skb->dev = napi->dev;
6d152e23 4324 skb->skb_iif = 0;
c3caf119
JC
4325 skb->encapsulation = 0;
4326 skb_shinfo(skb)->gso_type = 0;
e33d0ba8 4327 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
4328
4329 napi->skb = skb;
4330}
96e93eab 4331
76620aaf 4332struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 4333{
5d38a079 4334 struct sk_buff *skb = napi->skb;
5d38a079
HX
4335
4336 if (!skb) {
fd11a83d 4337 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
84b9cd63 4338 napi->skb = skb;
80595d59 4339 }
96e93eab
HX
4340 return skb;
4341}
76620aaf 4342EXPORT_SYMBOL(napi_get_frags);
96e93eab 4343
a50e233c
ED
4344static gro_result_t napi_frags_finish(struct napi_struct *napi,
4345 struct sk_buff *skb,
4346 gro_result_t ret)
96e93eab 4347{
5d0d9be8
HX
4348 switch (ret) {
4349 case GRO_NORMAL:
a50e233c
ED
4350 case GRO_HELD:
4351 __skb_push(skb, ETH_HLEN);
4352 skb->protocol = eth_type_trans(skb, skb->dev);
4353 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
c7c4b3b6 4354 ret = GRO_DROP;
86911732 4355 break;
5d38a079 4356
5d0d9be8 4357 case GRO_DROP:
5d0d9be8
HX
4358 case GRO_MERGED_FREE:
4359 napi_reuse_skb(napi, skb);
4360 break;
5b252f0c
BH
4361
4362 case GRO_MERGED:
4363 break;
5d0d9be8 4364 }
5d38a079 4365
c7c4b3b6 4366 return ret;
5d38a079 4367}
5d0d9be8 4368
a50e233c
ED
4369/* Upper GRO stack assumes network header starts at gro_offset=0
4370 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4371 * We copy ethernet header into skb->data to have a common layout.
4372 */
4adb9c4a 4373static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
4374{
4375 struct sk_buff *skb = napi->skb;
a50e233c
ED
4376 const struct ethhdr *eth;
4377 unsigned int hlen = sizeof(*eth);
76620aaf
HX
4378
4379 napi->skb = NULL;
4380
a50e233c
ED
4381 skb_reset_mac_header(skb);
4382 skb_gro_reset_offset(skb);
4383
4384 eth = skb_gro_header_fast(skb, 0);
4385 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4386 eth = skb_gro_header_slow(skb, hlen, 0);
4387 if (unlikely(!eth)) {
4388 napi_reuse_skb(napi, skb);
4389 return NULL;
4390 }
4391 } else {
4392 gro_pull_from_frag0(skb, hlen);
4393 NAPI_GRO_CB(skb)->frag0 += hlen;
4394 NAPI_GRO_CB(skb)->frag0_len -= hlen;
76620aaf 4395 }
a50e233c
ED
4396 __skb_pull(skb, hlen);
4397
4398 /*
4399 * This works because the only protocols we care about don't require
4400 * special handling.
4401 * We'll fix it up properly in napi_frags_finish()
4402 */
4403 skb->protocol = eth->h_proto;
76620aaf 4404
76620aaf
HX
4405 return skb;
4406}
76620aaf 4407
c7c4b3b6 4408gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 4409{
76620aaf 4410 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4411
4412 if (!skb)
c7c4b3b6 4413 return GRO_DROP;
5d0d9be8 4414
ae78dbfa
BH
4415 trace_napi_gro_frags_entry(skb);
4416
89c5fa33 4417 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4418}
5d38a079
HX
4419EXPORT_SYMBOL(napi_gro_frags);
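
/* Illustrative sketch (not part of dev.c): the frag-based GRO path used by
 * drivers that receive into pages rather than into a linear skb. The skb
 * comes from napi_get_frags(); the driver attaches page fragments and does
 * NOT call eth_type_trans() - napi_frags_skb() pulls the Ethernet header
 * itself. The page/DMA handling below is hypothetical and simplified.
 */
static void example_gro_frags_one(struct napi_struct *napi, struct page *page,
				  unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		/* allocation failure: drop (or recycle) the page */
		put_page(page);
		return;
	}

	/* Attach the received buffer as a fragment; truesize is approximate. */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
			PAGE_SIZE);

	/* Hands napi->skb to GRO; on error paths it is recycled via
	 * napi_reuse_skb() for the next frame.
	 */
	napi_gro_frags(napi);
}
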
4420
573e8fca
TH
4421/* Compute the checksum from gro_offset and return the folded value
4422 * after adding in any pseudo checksum.
4423 */
4424__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4425{
4426 __wsum wsum;
4427 __sum16 sum;
4428
4429 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4430
4431 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4432 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4433 if (likely(!sum)) {
4434 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4435 !skb->csum_complete_sw)
4436 netdev_rx_csum_fault(skb->dev);
4437 }
4438
4439 NAPI_GRO_CB(skb)->csum = wsum;
4440 NAPI_GRO_CB(skb)->csum_valid = 1;
4441
4442 return sum;
4443}
4444EXPORT_SYMBOL(__skb_gro_checksum_complete);
4445
e326bed2 4446/*
855abcf0 4447 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
e326bed2
ED
4448 * Note: called with local irq disabled, but exits with local irq enabled.
4449 */
4450static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4451{
4452#ifdef CONFIG_RPS
4453 struct softnet_data *remsd = sd->rps_ipi_list;
4454
4455 if (remsd) {
4456 sd->rps_ipi_list = NULL;
4457
4458 local_irq_enable();
4459
4460 /* Send pending IPIs to kick RPS processing on remote cpus. */
4461 while (remsd) {
4462 struct softnet_data *next = remsd->rps_ipi_next;
4463
4464 if (cpu_online(remsd->cpu))
c46fff2a 4465 smp_call_function_single_async(remsd->cpu,
fce8ad15 4466 &remsd->csd);
e326bed2
ED
4467 remsd = next;
4468 }
4469 } else
4470#endif
4471 local_irq_enable();
4472}
4473
d75b1ade
ED
4474static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4475{
4476#ifdef CONFIG_RPS
4477 return sd->rps_ipi_list != NULL;
4478#else
4479 return false;
4480#endif
4481}
4482
bea3348e 4483static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4484{
4485 int work = 0;
eecfd7c4 4486 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4487
e326bed2
ED
4488 /* Check if we have pending IPIs; it's better to send them now
4489 * rather than waiting for net_rx_action() to end.
4490 */
d75b1ade 4491 if (sd_has_rps_ipi_waiting(sd)) {
e326bed2
ED
4492 local_irq_disable();
4493 net_rps_action_and_irq_enable(sd);
4494 }
d75b1ade 4495
bea3348e 4496 napi->weight = weight_p;
6e7676c1 4497 local_irq_disable();
11ef7a89 4498 while (1) {
1da177e4 4499 struct sk_buff *skb;
6e7676c1
CG
4500
4501 while ((skb = __skb_dequeue(&sd->process_queue))) {
2c17d27c 4502 rcu_read_lock();
6e7676c1
CG
4503 local_irq_enable();
4504 __netif_receive_skb(skb);
2c17d27c 4505 rcu_read_unlock();
6e7676c1 4506 local_irq_disable();
76cc8b13
TH
4507 input_queue_head_incr(sd);
4508 if (++work >= quota) {
4509 local_irq_enable();
4510 return work;
4511 }
6e7676c1 4512 }
1da177e4 4513
e36fa2f7 4514 rps_lock(sd);
11ef7a89 4515 if (skb_queue_empty(&sd->input_pkt_queue)) {
eecfd7c4
ED
4516 /*
4517 * Inline a custom version of __napi_complete().
4518 * Only the current cpu owns and manipulates this napi,
11ef7a89
TH
4519 * and NAPI_STATE_SCHED is the only possible flag set
4520 * on backlog.
4521 * We can use a plain write instead of clear_bit(),
eecfd7c4
ED
4522 * and we don't need an smp_mb() memory barrier.
4523 */
eecfd7c4 4524 napi->state = 0;
11ef7a89 4525 rps_unlock(sd);
eecfd7c4 4526
11ef7a89 4527 break;
bea3348e 4528 }
11ef7a89
TH
4529
4530 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4531 &sd->process_queue);
e36fa2f7 4532 rps_unlock(sd);
6e7676c1
CG
4533 }
4534 local_irq_enable();
1da177e4 4535
bea3348e
SH
4536 return work;
4537}
1da177e4 4538
bea3348e
SH
4539/**
4540 * __napi_schedule - schedule for receive
c4ea43c5 4541 * @n: entry to schedule
bea3348e 4542 *
bc9ad166
ED
4543 * The entry's receive function will be scheduled to run.
4544 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
bea3348e 4545 */
b5606c2d 4546void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4547{
4548 unsigned long flags;
1da177e4 4549
bea3348e 4550 local_irq_save(flags);
903ceff7 4551 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
bea3348e 4552 local_irq_restore(flags);
1da177e4 4553}
bea3348e
SH
4554EXPORT_SYMBOL(__napi_schedule);
4555
bc9ad166
ED
4556/**
4557 * __napi_schedule_irqoff - schedule for receive
4558 * @n: entry to schedule
4559 *
4560 * Variant of __napi_schedule() assuming hard irqs are masked
4561 */
4562void __napi_schedule_irqoff(struct napi_struct *n)
4563{
4564 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4565}
4566EXPORT_SYMBOL(__napi_schedule_irqoff);
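
/* Illustrative sketch (not part of dev.c): a device interrupt handler
 * kicking NAPI. Hard irqs are masked here, so the _irqoff variant can be
 * used after napi_schedule_prep(). The private structure and the irq
 * masking helper are hypothetical; irqreturn_t/IRQ_HANDLED come from
 * <linux/interrupt.h>.
 */
struct example_priv {
	struct napi_struct napi;
	/* ... device registers, rx/tx rings ... */
};

static irqreturn_t example_isr(int irq, void *data)
{
	struct example_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		example_mask_rx_irq(priv);		/* hypothetical: stop further rx irqs */
		__napi_schedule_irqoff(&priv->napi);	/* ->poll() will run from softirq */
	}
	return IRQ_HANDLED;
}
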
4567
d565b0a1
HX
4568void __napi_complete(struct napi_struct *n)
4569{
4570 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
d565b0a1 4571
d75b1ade 4572 list_del_init(&n->poll_list);
4e857c58 4573 smp_mb__before_atomic();
d565b0a1
HX
4574 clear_bit(NAPI_STATE_SCHED, &n->state);
4575}
4576EXPORT_SYMBOL(__napi_complete);
4577
3b47d303 4578void napi_complete_done(struct napi_struct *n, int work_done)
d565b0a1
HX
4579{
4580 unsigned long flags;
4581
4582 /*
4583 * don't let napi dequeue from the cpu poll list
4584 * just in case its running on a different cpu
4585 */
4586 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4587 return;
4588
3b47d303
ED
4589 if (n->gro_list) {
4590 unsigned long timeout = 0;
d75b1ade 4591
3b47d303
ED
4592 if (work_done)
4593 timeout = n->dev->gro_flush_timeout;
4594
4595 if (timeout)
4596 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4597 HRTIMER_MODE_REL_PINNED);
4598 else
4599 napi_gro_flush(n, false);
4600 }
d75b1ade
ED
4601 if (likely(list_empty(&n->poll_list))) {
4602 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4603 } else {
4604 /* If n->poll_list is not empty, we need to mask irqs */
4605 local_irq_save(flags);
4606 __napi_complete(n);
4607 local_irq_restore(flags);
4608 }
d565b0a1 4609}
3b47d303 4610EXPORT_SYMBOL(napi_complete_done);
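
/* Illustrative sketch (not part of dev.c): a ->poll() callback. When less
 * than the budget was consumed the driver calls napi_complete_done(), which
 * either flushes the GRO list immediately or, if dev->gro_flush_timeout is
 * set, defers the flush to the hrtimer armed above. example_clean_rx() and
 * example_unmask_rx_irq() are hypothetical; struct example_priv is the
 * sketch introduced earlier.
 */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work_done;

	work_done = example_clean_rx(priv, budget);	/* feeds napi_gro_receive() */

	if (work_done < budget) {
		napi_complete_done(napi, work_done);
		example_unmask_rx_irq(priv);	/* re-enable the device interrupt */
	}

	/* Returning the full budget keeps this napi on the poll list
	 * (see napi_poll() below).
	 */
	return work_done;
}
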
d565b0a1 4611
af12fa6e
ET
4612/* must be called under rcu_read_lock(), as we dont take a reference */
4613struct napi_struct *napi_by_id(unsigned int napi_id)
4614{
4615 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4616 struct napi_struct *napi;
4617
4618 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4619 if (napi->napi_id == napi_id)
4620 return napi;
4621
4622 return NULL;
4623}
4624EXPORT_SYMBOL_GPL(napi_by_id);
4625
4626void napi_hash_add(struct napi_struct *napi)
4627{
4628 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4629
4630 spin_lock(&napi_hash_lock);
4631
4632 /* 0 is not a valid id; we also skip an id that is taken.
4633 * We expect both events to be extremely rare.
4634 */
4635 napi->napi_id = 0;
4636 while (!napi->napi_id) {
4637 napi->napi_id = ++napi_gen_id;
4638 if (napi_by_id(napi->napi_id))
4639 napi->napi_id = 0;
4640 }
4641
4642 hlist_add_head_rcu(&napi->napi_hash_node,
4643 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4644
4645 spin_unlock(&napi_hash_lock);
4646 }
4647}
4648EXPORT_SYMBOL_GPL(napi_hash_add);
4649
4650/* Warning: the caller is responsible for making sure an rcu grace period
4651 * is respected before freeing the memory containing @napi
4652 */
4653void napi_hash_del(struct napi_struct *napi)
4654{
4655 spin_lock(&napi_hash_lock);
4656
4657 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4658 hlist_del_rcu(&napi->napi_hash_node);
4659
4660 spin_unlock(&napi_hash_lock);
4661}
4662EXPORT_SYMBOL_GPL(napi_hash_del);
4663
3b47d303
ED
4664static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4665{
4666 struct napi_struct *napi;
4667
4668 napi = container_of(timer, struct napi_struct, timer);
4669 if (napi->gro_list)
4670 napi_schedule(napi);
4671
4672 return HRTIMER_NORESTART;
4673}
4674
d565b0a1
HX
4675void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4676 int (*poll)(struct napi_struct *, int), int weight)
4677{
4678 INIT_LIST_HEAD(&napi->poll_list);
3b47d303
ED
4679 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4680 napi->timer.function = napi_watchdog;
4ae5544f 4681 napi->gro_count = 0;
d565b0a1 4682 napi->gro_list = NULL;
5d38a079 4683 napi->skb = NULL;
d565b0a1 4684 napi->poll = poll;
82dc3c63
ED
4685 if (weight > NAPI_POLL_WEIGHT)
4686 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4687 weight, dev->name);
d565b0a1
HX
4688 napi->weight = weight;
4689 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4690 napi->dev = dev;
5d38a079 4691#ifdef CONFIG_NETPOLL
d565b0a1
HX
4692 spin_lock_init(&napi->poll_lock);
4693 napi->poll_owner = -1;
4694#endif
4695 set_bit(NAPI_STATE_SCHED, &napi->state);
4696}
4697EXPORT_SYMBOL(netif_napi_add);
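
/* Illustrative sketch (not part of dev.c): registering and enabling the
 * NAPI context from a driver's ndo_open/ndo_stop, assuming the private
 * area was allocated with alloc_etherdev(sizeof(struct example_priv)) and
 * example_poll() is the callback sketched earlier. NAPI_POLL_WEIGHT is the
 * recommended weight; larger values trigger the pr_err_once() above.
 */
static int example_open(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);	/* clears the SCHED bit set by netif_napi_add() */
	/* ... allocate rings, enable rx interrupts ... */
	return 0;
}

static int example_stop(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	napi_disable(&priv->napi);	/* waits for an in-flight ->poll() to finish */
	netif_napi_del(&priv->napi);
	return 0;
}
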
4698
3b47d303
ED
4699void napi_disable(struct napi_struct *n)
4700{
4701 might_sleep();
4702 set_bit(NAPI_STATE_DISABLE, &n->state);
4703
4704 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4705 msleep(1);
4706
4707 hrtimer_cancel(&n->timer);
4708
4709 clear_bit(NAPI_STATE_DISABLE, &n->state);
4710}
4711EXPORT_SYMBOL(napi_disable);
4712
d565b0a1
HX
4713void netif_napi_del(struct napi_struct *napi)
4714{
d7b06636 4715 list_del_init(&napi->dev_list);
76620aaf 4716 napi_free_frags(napi);
d565b0a1 4717
289dccbe 4718 kfree_skb_list(napi->gro_list);
d565b0a1 4719 napi->gro_list = NULL;
4ae5544f 4720 napi->gro_count = 0;
d565b0a1
HX
4721}
4722EXPORT_SYMBOL(netif_napi_del);
4723
726ce70e
HX
4724static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4725{
4726 void *have;
4727 int work, weight;
4728
4729 list_del_init(&n->poll_list);
4730
4731 have = netpoll_poll_lock(n);
4732
4733 weight = n->weight;
4734
4735 /* This NAPI_STATE_SCHED test is for avoiding a race
4736 * with netpoll's poll_napi(). Only the entity which
4737 * obtains the lock and sees NAPI_STATE_SCHED set will
4738 * actually make the ->poll() call. Therefore we avoid
4739 * accidentally calling ->poll() when NAPI is not scheduled.
4740 */
4741 work = 0;
4742 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4743 work = n->poll(n, weight);
4744 trace_napi_poll(n);
4745 }
4746
4747 WARN_ON_ONCE(work > weight);
4748
4749 if (likely(work < weight))
4750 goto out_unlock;
4751
4752 /* Drivers must not modify the NAPI state if they
4753 * consume the entire weight. In such cases this code
4754 * still "owns" the NAPI instance and therefore can
4755 * move the instance around on the list at-will.
4756 */
4757 if (unlikely(napi_disable_pending(n))) {
4758 napi_complete(n);
4759 goto out_unlock;
4760 }
4761
4762 if (n->gro_list) {
4763 /* flush too old packets
4764 * If HZ < 1000, flush all packets.
4765 */
4766 napi_gro_flush(n, HZ >= 1000);
4767 }
4768
001ce546
HX
4769 /* Some drivers may have called napi_schedule
4770 * prior to exhausting their budget.
4771 */
4772 if (unlikely(!list_empty(&n->poll_list))) {
4773 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4774 n->dev ? n->dev->name : "backlog");
4775 goto out_unlock;
4776 }
4777
726ce70e
HX
4778 list_add_tail(&n->poll_list, repoll);
4779
4780out_unlock:
4781 netpoll_poll_unlock(have);
4782
4783 return work;
4784}
4785
1da177e4
LT
4786static void net_rx_action(struct softirq_action *h)
4787{
903ceff7 4788 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
24f8b238 4789 unsigned long time_limit = jiffies + 2;
51b0bded 4790 int budget = netdev_budget;
d75b1ade
ED
4791 LIST_HEAD(list);
4792 LIST_HEAD(repoll);
53fb95d3 4793
1da177e4 4794 local_irq_disable();
d75b1ade
ED
4795 list_splice_init(&sd->poll_list, &list);
4796 local_irq_enable();
1da177e4 4797
ceb8d5bf 4798 for (;;) {
bea3348e 4799 struct napi_struct *n;
1da177e4 4800
ceb8d5bf
HX
4801 if (list_empty(&list)) {
4802 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4803 return;
4804 break;
4805 }
4806
6bd373eb
HX
4807 n = list_first_entry(&list, struct napi_struct, poll_list);
4808 budget -= napi_poll(n, &repoll);
4809
d75b1ade 4810 /* If softirq window is exhausted then punt.
24f8b238
SH
4811 * Allow this to run for 2 jiffies, which allows
4812 * an average latency of 1.5/HZ.
bea3348e 4813 */
ceb8d5bf
HX
4814 if (unlikely(budget <= 0 ||
4815 time_after_eq(jiffies, time_limit))) {
4816 sd->time_squeeze++;
4817 break;
4818 }
1da177e4 4819 }
d75b1ade 4820
d75b1ade
ED
4821 local_irq_disable();
4822
4823 list_splice_tail_init(&sd->poll_list, &list);
4824 list_splice_tail(&repoll, &list);
4825 list_splice(&list, &sd->poll_list);
4826 if (!list_empty(&sd->poll_list))
4827 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4828
e326bed2 4829 net_rps_action_and_irq_enable(sd);
1da177e4
LT
4830}
4831
aa9d8560 4832struct netdev_adjacent {
9ff162a8 4833 struct net_device *dev;
5d261913
VF
4834
4835 /* upper master flag, there can only be one master device per list */
9ff162a8 4836 bool master;
5d261913 4837
5d261913
VF
4838 /* counter for the number of times this device was added to us */
4839 u16 ref_nr;
4840
402dae96
VF
4841 /* private field for the users */
4842 void *private;
4843
9ff162a8
JP
4844 struct list_head list;
4845 struct rcu_head rcu;
9ff162a8
JP
4846};
4847
5d261913
VF
4848static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4849 struct net_device *adj_dev,
2f268f12 4850 struct list_head *adj_list)
9ff162a8 4851{
5d261913 4852 struct netdev_adjacent *adj;
5d261913 4853
2f268f12 4854 list_for_each_entry(adj, adj_list, list) {
5d261913
VF
4855 if (adj->dev == adj_dev)
4856 return adj;
9ff162a8
JP
4857 }
4858 return NULL;
4859}
4860
4861/**
4862 * netdev_has_upper_dev - Check if device is linked to an upper device
4863 * @dev: device
4864 * @upper_dev: upper device to check
4865 *
4866 * Find out if a device is linked to the specified upper device and return true
4867 * in case it is. Note that this checks only the immediate upper device,
4868 * not through a complete stack of devices. The caller must hold the RTNL lock.
4869 */
4870bool netdev_has_upper_dev(struct net_device *dev,
4871 struct net_device *upper_dev)
4872{
4873 ASSERT_RTNL();
4874
2f268f12 4875 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
9ff162a8
JP
4876}
4877EXPORT_SYMBOL(netdev_has_upper_dev);
4878
4879/**
4880 * netdev_has_any_upper_dev - Check if device is linked to some device
4881 * @dev: device
4882 *
4883 * Find out if a device is linked to an upper device and return true in case
4884 * it is. The caller must hold the RTNL lock.
4885 */
1d143d9f 4886static bool netdev_has_any_upper_dev(struct net_device *dev)
9ff162a8
JP
4887{
4888 ASSERT_RTNL();
4889
2f268f12 4890 return !list_empty(&dev->all_adj_list.upper);
9ff162a8 4891}
9ff162a8
JP
4892
4893/**
4894 * netdev_master_upper_dev_get - Get master upper device
4895 * @dev: device
4896 *
4897 * Find a master upper device and return pointer to it or NULL in case
4898 * it's not there. The caller must hold the RTNL lock.
4899 */
4900struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4901{
aa9d8560 4902 struct netdev_adjacent *upper;
9ff162a8
JP
4903
4904 ASSERT_RTNL();
4905
2f268f12 4906 if (list_empty(&dev->adj_list.upper))
9ff162a8
JP
4907 return NULL;
4908
2f268f12 4909 upper = list_first_entry(&dev->adj_list.upper,
aa9d8560 4910 struct netdev_adjacent, list);
9ff162a8
JP
4911 if (likely(upper->master))
4912 return upper->dev;
4913 return NULL;
4914}
4915EXPORT_SYMBOL(netdev_master_upper_dev_get);
4916
b6ccba4c
VF
4917void *netdev_adjacent_get_private(struct list_head *adj_list)
4918{
4919 struct netdev_adjacent *adj;
4920
4921 adj = list_entry(adj_list, struct netdev_adjacent, list);
4922
4923 return adj->private;
4924}
4925EXPORT_SYMBOL(netdev_adjacent_get_private);
4926
44a40855
VY
4927/**
4928 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4929 * @dev: device
4930 * @iter: list_head ** of the current position
4931 *
4932 * Gets the next device from the dev's upper list, starting from iter
4933 * position. The caller must hold RCU read lock.
4934 */
4935struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4936 struct list_head **iter)
4937{
4938 struct netdev_adjacent *upper;
4939
4940 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4941
4942 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4943
4944 if (&upper->list == &dev->adj_list.upper)
4945 return NULL;
4946
4947 *iter = &upper->list;
4948
4949 return upper->dev;
4950}
4951EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
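
/* Illustrative sketch (not part of dev.c): walking the immediate upper
 * devices of a port under rcu_read_lock() with the iterator above, e.g. to
 * find the bridge or bond a port is enslaved to. The printout is only for
 * illustration.
 */
static void example_dump_uppers(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;

	rcu_read_lock();
	iter = &dev->adj_list.upper;
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		netdev_info(dev, "upper device: %s\n", upper->name);
	rcu_read_unlock();
}
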
4952
31088a11
VF
4953/**
4954 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
48311f46
VF
4955 * @dev: device
4956 * @iter: list_head ** of the current position
4957 *
4958 * Gets the next device from the dev's upper list, starting from iter
4959 * position. The caller must hold RCU read lock.
4960 */
2f268f12
VF
4961struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4962 struct list_head **iter)
48311f46
VF
4963{
4964 struct netdev_adjacent *upper;
4965
85328240 4966 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
48311f46
VF
4967
4968 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4969
2f268f12 4970 if (&upper->list == &dev->all_adj_list.upper)
48311f46
VF
4971 return NULL;
4972
4973 *iter = &upper->list;
4974
4975 return upper->dev;
4976}
2f268f12 4977EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
48311f46 4978
31088a11
VF
4979/**
4980 * netdev_lower_get_next_private - Get the next ->private from the
4981 * lower neighbour list
4982 * @dev: device
4983 * @iter: list_head ** of the current position
4984 *
4985 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4986 * list, starting from iter position. The caller must either hold the
4987 * RTNL lock or its own locking that guarantees that the neighbour lower
4988 * list will remain unchanged.
4989 */
4990void *netdev_lower_get_next_private(struct net_device *dev,
4991 struct list_head **iter)
4992{
4993 struct netdev_adjacent *lower;
4994
4995 lower = list_entry(*iter, struct netdev_adjacent, list);
4996
4997 if (&lower->list == &dev->adj_list.lower)
4998 return NULL;
4999
6859e7df 5000 *iter = lower->list.next;
31088a11
VF
5001
5002 return lower->private;
5003}
5004EXPORT_SYMBOL(netdev_lower_get_next_private);
5005
5006/**
5007 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5008 * lower neighbour list, RCU
5009 * variant
5010 * @dev: device
5011 * @iter: list_head ** of the current position
5012 *
5013 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5014 * list, starting from iter position. The caller must hold RCU read lock.
5015 */
5016void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5017 struct list_head **iter)
5018{
5019 struct netdev_adjacent *lower;
5020
5021 WARN_ON_ONCE(!rcu_read_lock_held());
5022
5023 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5024
5025 if (&lower->list == &dev->adj_list.lower)
5026 return NULL;
5027
6859e7df 5028 *iter = &lower->list;
31088a11
VF
5029
5030 return lower->private;
5031}
5032EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5033
4085ebe8
VY
5034/**
5035 * netdev_lower_get_next - Get the next device from the lower neighbour
5036 * list
5037 * @dev: device
5038 * @iter: list_head ** of the current position
5039 *
5040 * Gets the next netdev_adjacent from the dev's lower neighbour
5041 * list, starting from iter position. The caller must hold the RTNL lock or
5042 * its own locking that guarantees that the neighbour lower
5043 * list will remain unchanged.
5044 */
5045void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5046{
5047 struct netdev_adjacent *lower;
5048
5049 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5050
5051 if (&lower->list == &dev->adj_list.lower)
5052 return NULL;
5053
5054 *iter = &lower->list;
5055
5056 return lower->dev;
5057}
5058EXPORT_SYMBOL(netdev_lower_get_next);
5059
e001bfad 5060/**
5061 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5062 * lower neighbour list, RCU
5063 * variant
5064 * @dev: device
5065 *
5066 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5067 * list. The caller must hold RCU read lock.
5068 */
5069void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5070{
5071 struct netdev_adjacent *lower;
5072
5073 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5074 struct netdev_adjacent, list);
5075 if (lower)
5076 return lower->private;
5077 return NULL;
5078}
5079EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5080
9ff162a8
JP
5081/**
5082 * netdev_master_upper_dev_get_rcu - Get master upper device
5083 * @dev: device
5084 *
5085 * Find a master upper device and return pointer to it or NULL in case
5086 * it's not there. The caller must hold the RCU read lock.
5087 */
5088struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5089{
aa9d8560 5090 struct netdev_adjacent *upper;
9ff162a8 5091
2f268f12 5092 upper = list_first_or_null_rcu(&dev->adj_list.upper,
aa9d8560 5093 struct netdev_adjacent, list);
9ff162a8
JP
5094 if (upper && likely(upper->master))
5095 return upper->dev;
5096 return NULL;
5097}
5098EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5099
0a59f3a9 5100static int netdev_adjacent_sysfs_add(struct net_device *dev,
3ee32707
VF
5101 struct net_device *adj_dev,
5102 struct list_head *dev_list)
5103{
5104 char linkname[IFNAMSIZ+7];
5105 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5106 "upper_%s" : "lower_%s", adj_dev->name);
5107 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5108 linkname);
5109}
0a59f3a9 5110static void netdev_adjacent_sysfs_del(struct net_device *dev,
3ee32707
VF
5111 char *name,
5112 struct list_head *dev_list)
5113{
5114 char linkname[IFNAMSIZ+7];
5115 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5116 "upper_%s" : "lower_%s", name);
5117 sysfs_remove_link(&(dev->dev.kobj), linkname);
5118}
5119
7ce64c79
AF
5120static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5121 struct net_device *adj_dev,
5122 struct list_head *dev_list)
5123{
5124 return (dev_list == &dev->adj_list.upper ||
5125 dev_list == &dev->adj_list.lower) &&
5126 net_eq(dev_net(dev), dev_net(adj_dev));
5127}
3ee32707 5128
5d261913
VF
5129static int __netdev_adjacent_dev_insert(struct net_device *dev,
5130 struct net_device *adj_dev,
7863c054 5131 struct list_head *dev_list,
402dae96 5132 void *private, bool master)
5d261913
VF
5133{
5134 struct netdev_adjacent *adj;
842d67a7 5135 int ret;
5d261913 5136
7863c054 5137 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913
VF
5138
5139 if (adj) {
5d261913
VF
5140 adj->ref_nr++;
5141 return 0;
5142 }
5143
5144 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5145 if (!adj)
5146 return -ENOMEM;
5147
5148 adj->dev = adj_dev;
5149 adj->master = master;
5d261913 5150 adj->ref_nr = 1;
402dae96 5151 adj->private = private;
5d261913 5152 dev_hold(adj_dev);
2f268f12
VF
5153
5154 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5155 adj_dev->name, dev->name, adj_dev->name);
5d261913 5156
7ce64c79 5157 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
3ee32707 5158 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5831d66e
VF
5159 if (ret)
5160 goto free_adj;
5161 }
5162
7863c054 5163 /* Ensure that master link is always the first item in list. */
842d67a7
VF
5164 if (master) {
5165 ret = sysfs_create_link(&(dev->dev.kobj),
5166 &(adj_dev->dev.kobj), "master");
5167 if (ret)
5831d66e 5168 goto remove_symlinks;
842d67a7 5169
7863c054 5170 list_add_rcu(&adj->list, dev_list);
842d67a7 5171 } else {
7863c054 5172 list_add_tail_rcu(&adj->list, dev_list);
842d67a7 5173 }
5d261913
VF
5174
5175 return 0;
842d67a7 5176
5831d66e 5177remove_symlinks:
7ce64c79 5178 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5179 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
842d67a7
VF
5180free_adj:
5181 kfree(adj);
974daef7 5182 dev_put(adj_dev);
842d67a7
VF
5183
5184 return ret;
5d261913
VF
5185}
5186
1d143d9f 5187static void __netdev_adjacent_dev_remove(struct net_device *dev,
5188 struct net_device *adj_dev,
5189 struct list_head *dev_list)
5d261913
VF
5190{
5191 struct netdev_adjacent *adj;
5192
7863c054 5193 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913 5194
2f268f12
VF
5195 if (!adj) {
5196 pr_err("tried to remove device %s from %s\n",
5197 dev->name, adj_dev->name);
5d261913 5198 BUG();
2f268f12 5199 }
5d261913
VF
5200
5201 if (adj->ref_nr > 1) {
2f268f12
VF
5202 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5203 adj->ref_nr-1);
5d261913
VF
5204 adj->ref_nr--;
5205 return;
5206 }
5207
842d67a7
VF
5208 if (adj->master)
5209 sysfs_remove_link(&(dev->dev.kobj), "master");
5210
7ce64c79 5211 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5212 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831d66e 5213
5d261913 5214 list_del_rcu(&adj->list);
2f268f12
VF
5215 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5216 adj_dev->name, dev->name, adj_dev->name);
5d261913
VF
5217 dev_put(adj_dev);
5218 kfree_rcu(adj, rcu);
5219}
5220
1d143d9f 5221static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5222 struct net_device *upper_dev,
5223 struct list_head *up_list,
5224 struct list_head *down_list,
5225 void *private, bool master)
5d261913
VF
5226{
5227 int ret;
5228
402dae96
VF
5229 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5230 master);
5d261913
VF
5231 if (ret)
5232 return ret;
5233
402dae96
VF
5234 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5235 false);
5d261913 5236 if (ret) {
2f268f12 5237 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5d261913
VF
5238 return ret;
5239 }
5240
5241 return 0;
5242}
5243
1d143d9f 5244static int __netdev_adjacent_dev_link(struct net_device *dev,
5245 struct net_device *upper_dev)
5d261913 5246{
2f268f12
VF
5247 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5248 &dev->all_adj_list.upper,
5249 &upper_dev->all_adj_list.lower,
402dae96 5250 NULL, false);
5d261913
VF
5251}
5252
1d143d9f 5253static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5254 struct net_device *upper_dev,
5255 struct list_head *up_list,
5256 struct list_head *down_list)
5d261913 5257{
2f268f12
VF
5258 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5259 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5d261913
VF
5260}
5261
1d143d9f 5262static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5263 struct net_device *upper_dev)
5d261913 5264{
2f268f12
VF
5265 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5266 &dev->all_adj_list.upper,
5267 &upper_dev->all_adj_list.lower);
5268}
5269
1d143d9f 5270static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5271 struct net_device *upper_dev,
5272 void *private, bool master)
2f268f12
VF
5273{
5274 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5275
5276 if (ret)
5277 return ret;
5278
5279 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5280 &dev->adj_list.upper,
5281 &upper_dev->adj_list.lower,
402dae96 5282 private, master);
2f268f12
VF
5283 if (ret) {
5284 __netdev_adjacent_dev_unlink(dev, upper_dev);
5285 return ret;
5286 }
5287
5288 return 0;
5d261913
VF
5289}
5290
1d143d9f 5291static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5292 struct net_device *upper_dev)
2f268f12
VF
5293{
5294 __netdev_adjacent_dev_unlink(dev, upper_dev);
5295 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5296 &dev->adj_list.upper,
5297 &upper_dev->adj_list.lower);
5298}
5d261913 5299
9ff162a8 5300static int __netdev_upper_dev_link(struct net_device *dev,
402dae96
VF
5301 struct net_device *upper_dev, bool master,
5302 void *private)
9ff162a8 5303{
5d261913
VF
5304 struct netdev_adjacent *i, *j, *to_i, *to_j;
5305 int ret = 0;
9ff162a8
JP
5306
5307 ASSERT_RTNL();
5308
5309 if (dev == upper_dev)
5310 return -EBUSY;
5311
5312 /* To prevent loops, check if dev is not upper device to upper_dev. */
2f268f12 5313 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
9ff162a8
JP
5314 return -EBUSY;
5315
d66bf7dd 5316 if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
9ff162a8
JP
5317 return -EEXIST;
5318
5319 if (master && netdev_master_upper_dev_get(dev))
5320 return -EBUSY;
5321
402dae96
VF
5322 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5323 master);
5d261913
VF
5324 if (ret)
5325 return ret;
9ff162a8 5326
5d261913 5327 /* Now that we linked these devs, make all the upper_dev's
2f268f12 5328 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5d261913
VF
5329 * vice versa, and don't forget the devices themselves. All of these
5330 * links are non-neighbours.
5331 */
2f268f12
VF
5332 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5333 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5334 pr_debug("Interlinking %s with %s, non-neighbour\n",
5335 i->dev->name, j->dev->name);
5d261913
VF
5336 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5337 if (ret)
5338 goto rollback_mesh;
5339 }
5340 }
5341
5342 /* add dev to every upper_dev's upper device */
2f268f12
VF
5343 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5344 pr_debug("linking %s's upper device %s with %s\n",
5345 upper_dev->name, i->dev->name, dev->name);
5d261913
VF
5346 ret = __netdev_adjacent_dev_link(dev, i->dev);
5347 if (ret)
5348 goto rollback_upper_mesh;
5349 }
5350
5351 /* add upper_dev to every dev's lower device */
2f268f12
VF
5352 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5353 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5354 i->dev->name, upper_dev->name);
5d261913
VF
5355 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5356 if (ret)
5357 goto rollback_lower_mesh;
5358 }
9ff162a8 5359
42e52bf9 5360 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8 5361 return 0;
5d261913
VF
5362
5363rollback_lower_mesh:
5364 to_i = i;
2f268f12 5365 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5d261913
VF
5366 if (i == to_i)
5367 break;
5368 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5369 }
5370
5371 i = NULL;
5372
5373rollback_upper_mesh:
5374 to_i = i;
2f268f12 5375 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5376 if (i == to_i)
5377 break;
5378 __netdev_adjacent_dev_unlink(dev, i->dev);
5379 }
5380
5381 i = j = NULL;
5382
5383rollback_mesh:
5384 to_i = i;
5385 to_j = j;
2f268f12
VF
5386 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5387 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5388 if (i == to_i && j == to_j)
5389 break;
5390 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5391 }
5392 if (i == to_i)
5393 break;
5394 }
5395
2f268f12 5396 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5397
5398 return ret;
9ff162a8
JP
5399}
5400
5401/**
5402 * netdev_upper_dev_link - Add a link to the upper device
5403 * @dev: device
5404 * @upper_dev: new upper device
5405 *
5406 * Adds a link to a device which is upper to this one. The caller must hold
5407 * the RTNL lock. On a failure a negative errno code is returned.
5408 * On success the reference counts are adjusted and the function
5409 * returns zero.
5410 */
5411int netdev_upper_dev_link(struct net_device *dev,
5412 struct net_device *upper_dev)
5413{
402dae96 5414 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
9ff162a8
JP
5415}
5416EXPORT_SYMBOL(netdev_upper_dev_link);
5417
5418/**
5419 * netdev_master_upper_dev_link - Add a master link to the upper device
5420 * @dev: device
5421 * @upper_dev: new upper device
5422 *
5423 * Adds a link to a device which is upper to this one. In this case, only
5424 * one master upper device can be linked, although other non-master devices
5425 * might be linked as well. The caller must hold the RTNL lock.
5426 * On a failure a negative errno code is returned. On success the reference
5427 * counts are adjusted and the function returns zero.
5428 */
5429int netdev_master_upper_dev_link(struct net_device *dev,
5430 struct net_device *upper_dev)
5431{
402dae96 5432 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
9ff162a8
JP
5433}
5434EXPORT_SYMBOL(netdev_master_upper_dev_link);
5435
402dae96
VF
5436int netdev_master_upper_dev_link_private(struct net_device *dev,
5437 struct net_device *upper_dev,
5438 void *private)
5439{
5440 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5441}
5442EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
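/* Illustrative sketch, not part of dev.c: how a hypothetical aggregating
 * driver might attach a lower port with per-port private data via
 * netdev_master_upper_dev_link_private() and read it back later with
 * netdev_lower_dev_get_private().  "bond_dev", "port_dev" and
 * "struct example_port" are names invented for this example; all calls
 * below must be made under the RTNL lock.
 */
struct example_port {
        int id;
};

static int example_add_port(struct net_device *bond_dev,
                            struct net_device *port_dev,
                            struct example_port *port)
{
        struct example_port *stored;
        int err;

        ASSERT_RTNL();

        /* port_dev becomes a lower device of bond_dev; @port is stored in
         * the adjacency entries created above.
         */
        err = netdev_master_upper_dev_link_private(port_dev, bond_dev, port);
        if (err)
                return err;

        stored = netdev_lower_dev_get_private(bond_dev, port_dev);
        if (stored)
                pr_debug("example: linked port %s (id %d)\n",
                         port_dev->name, stored->id);
        return 0;
}

static void example_del_port(struct net_device *bond_dev,
                             struct net_device *port_dev)
{
        ASSERT_RTNL();
        netdev_upper_dev_unlink(port_dev, bond_dev);
}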
5443
9ff162a8
JP
5444/**
5445 * netdev_upper_dev_unlink - Removes a link to upper device
5446 * @dev: device
5447 * @upper_dev: upper device to remove the link to
5448 *
5449 * Removes a link to a device which is upper to this one. The caller must hold
5450 * the RTNL lock.
5451 */
5452void netdev_upper_dev_unlink(struct net_device *dev,
5453 struct net_device *upper_dev)
5454{
5d261913 5455 struct netdev_adjacent *i, *j;
9ff162a8
JP
5456 ASSERT_RTNL();
5457
2f268f12 5458 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5459
5460 /* Here is the tricky part. We must remove all dev's lower
5461 * devices from all upper_dev's upper devices and vice
5462 * versa, to maintain the graph relationship.
5463 */
2f268f12
VF
5464 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5465 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5466 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5467
5468 /* also remove the devices themselves from the lower/upper device
5469 * lists
5470 */
2f268f12 5471 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5d261913
VF
5472 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5473
2f268f12 5474 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5475 __netdev_adjacent_dev_unlink(dev, i->dev);
5476
42e52bf9 5477 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
5478}
5479EXPORT_SYMBOL(netdev_upper_dev_unlink);
5480
61bd3857
MS
5481/**
5482 * netdev_bonding_info_change - Dispatch event about slave change
5483 * @dev: device
4a26e453 5484 * @bonding_info: info to dispatch
61bd3857
MS
5485 *
5486 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5487 * The caller must hold the RTNL lock.
5488 */
5489void netdev_bonding_info_change(struct net_device *dev,
5490 struct netdev_bonding_info *bonding_info)
5491{
5492 struct netdev_notifier_bonding_info info;
5493
5494 memcpy(&info.bonding_info, bonding_info,
5495 sizeof(struct netdev_bonding_info));
5496 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5497 &info.info);
5498}
5499EXPORT_SYMBOL(netdev_bonding_info_change);
5500
2ce1ee17 5501static void netdev_adjacent_add_links(struct net_device *dev)
4c75431a
AF
5502{
5503 struct netdev_adjacent *iter;
5504
5505 struct net *net = dev_net(dev);
5506
5507 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5508 if (!net_eq(net, dev_net(iter->dev)))
5509 continue;
5510 netdev_adjacent_sysfs_add(iter->dev, dev,
5511 &iter->dev->adj_list.lower);
5512 netdev_adjacent_sysfs_add(dev, iter->dev,
5513 &dev->adj_list.upper);
5514 }
5515
5516 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5518 if (!net_eq(net, dev_net(iter->dev)))
5518 continue;
5519 netdev_adjacent_sysfs_add(iter->dev, dev,
5520 &iter->dev->adj_list.upper);
5521 netdev_adjacent_sysfs_add(dev, iter->dev,
5522 &dev->adj_list.lower);
5523 }
5524}
5525
2ce1ee17 5526static void netdev_adjacent_del_links(struct net_device *dev)
4c75431a
AF
5527{
5528 struct netdev_adjacent *iter;
5529
5530 struct net *net = dev_net(dev);
5531
5532 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5534 if (!net_eq(net, dev_net(iter->dev)))
5534 continue;
5535 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5536 &iter->dev->adj_list.lower);
5537 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5538 &dev->adj_list.upper);
5539 }
5540
5541 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5543 if (!net_eq(net, dev_net(iter->dev)))
5543 continue;
5544 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5545 &iter->dev->adj_list.upper);
5546 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5547 &dev->adj_list.lower);
5548 }
5549}
5550
5bb025fa 5551void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
402dae96 5552{
5bb025fa 5553 struct netdev_adjacent *iter;
402dae96 5554
4c75431a
AF
5555 struct net *net = dev_net(dev);
5556
5bb025fa 5557 list_for_each_entry(iter, &dev->adj_list.upper, list) {
4c75431a
AF
5558 if (!net_eq(net, dev_net(iter->dev)))
5559 continue;
5bb025fa
VF
5560 netdev_adjacent_sysfs_del(iter->dev, oldname,
5561 &iter->dev->adj_list.lower);
5562 netdev_adjacent_sysfs_add(iter->dev, dev,
5563 &iter->dev->adj_list.lower);
5564 }
402dae96 5565
5bb025fa 5566 list_for_each_entry(iter, &dev->adj_list.lower, list) {
4c75431a
AF
5567 if (!net_eq(net, dev_net(iter->dev)))
5568 continue;
5bb025fa
VF
5569 netdev_adjacent_sysfs_del(iter->dev, oldname,
5570 &iter->dev->adj_list.upper);
5571 netdev_adjacent_sysfs_add(iter->dev, dev,
5572 &iter->dev->adj_list.upper);
5573 }
402dae96 5574}
402dae96
VF
5575
5576void *netdev_lower_dev_get_private(struct net_device *dev,
5577 struct net_device *lower_dev)
5578{
5579 struct netdev_adjacent *lower;
5580
5581 if (!lower_dev)
5582 return NULL;
5583 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5584 if (!lower)
5585 return NULL;
5586
5587 return lower->private;
5588}
5589EXPORT_SYMBOL(netdev_lower_dev_get_private);
5590
4085ebe8
VY
5591
5592int dev_get_nest_level(struct net_device *dev,
5593 bool (*type_check)(struct net_device *dev))
5594{
5595 struct net_device *lower = NULL;
5596 struct list_head *iter;
5597 int max_nest = -1;
5598 int nest;
5599
5600 ASSERT_RTNL();
5601
5602 netdev_for_each_lower_dev(dev, lower, iter) {
5603 nest = dev_get_nest_level(lower, type_check);
5604 if (max_nest < nest)
5605 max_nest = nest;
5606 }
5607
5608 if (type_check(dev))
5609 max_nest++;
5610
5611 return max_nest;
5612}
5613EXPORT_SYMBOL(dev_get_nest_level);
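/* Illustrative sketch, not part of dev.c: a type_check callback for
 * dev_get_nest_level().  example_is_vlan() mirrors what is_vlan_dev()
 * does and is defined here only so the sketch is self-contained; a real
 * caller (e.g. a stacking driver sizing lockdep subclasses) would pass
 * its own predicate.
 */
static bool example_is_vlan(struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_802_1Q_VLAN);
}

static int example_vlan_nest_level(struct net_device *dev)
{
        int level;

        ASSERT_RTNL();
        level = dev_get_nest_level(dev, example_is_vlan);
        pr_debug("%s: vlan nest level %d\n", dev->name, level);
        return level;
}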
5614
b6c40d68
PM
5615static void dev_change_rx_flags(struct net_device *dev, int flags)
5616{
d314774c
SH
5617 const struct net_device_ops *ops = dev->netdev_ops;
5618
d2615bf4 5619 if (ops->ndo_change_rx_flags)
d314774c 5620 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
5621}
5622
991fb3f7 5623static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
1da177e4 5624{
b536db93 5625 unsigned int old_flags = dev->flags;
d04a48b0
EB
5626 kuid_t uid;
5627 kgid_t gid;
1da177e4 5628
24023451
PM
5629 ASSERT_RTNL();
5630
dad9b335
WC
5631 dev->flags |= IFF_PROMISC;
5632 dev->promiscuity += inc;
5633 if (dev->promiscuity == 0) {
5634 /*
5635 * Avoid overflow.
5636 * If inc causes overflow, untouch promisc and return error.
5637 */
5638 if (inc < 0)
5639 dev->flags &= ~IFF_PROMISC;
5640 else {
5641 dev->promiscuity -= inc;
7b6cd1ce
JP
5642 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5643 dev->name);
dad9b335
WC
5644 return -EOVERFLOW;
5645 }
5646 }
52609c0b 5647 if (dev->flags != old_flags) {
7b6cd1ce
JP
5648 pr_info("device %s %s promiscuous mode\n",
5649 dev->name,
5650 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
5651 if (audit_enabled) {
5652 current_uid_gid(&uid, &gid);
7759db82
KHK
5653 audit_log(current->audit_context, GFP_ATOMIC,
5654 AUDIT_ANOM_PROMISCUOUS,
5655 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5656 dev->name, (dev->flags & IFF_PROMISC),
5657 (old_flags & IFF_PROMISC),
e1760bd5 5658 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
5659 from_kuid(&init_user_ns, uid),
5660 from_kgid(&init_user_ns, gid),
7759db82 5661 audit_get_sessionid(current));
8192b0c4 5662 }
24023451 5663
b6c40d68 5664 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 5665 }
991fb3f7
ND
5666 if (notify)
5667 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
dad9b335 5668 return 0;
1da177e4
LT
5669}
5670
4417da66
PM
5671/**
5672 * dev_set_promiscuity - update promiscuity count on a device
5673 * @dev: device
5674 * @inc: modifier
5675 *
5676 * Add or remove promiscuity from a device. While the count in the device
5677 * remains above zero the interface remains promiscuous. Once it hits zero
5678 * the device reverts back to normal filtering operation. A negative inc
5679 * value is used to drop promiscuity on the device.
dad9b335 5680 * Return 0 if successful or a negative errno code on error.
4417da66 5681 */
dad9b335 5682int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 5683{
b536db93 5684 unsigned int old_flags = dev->flags;
dad9b335 5685 int err;
4417da66 5686
991fb3f7 5687 err = __dev_set_promiscuity(dev, inc, true);
4b5a698e 5688 if (err < 0)
dad9b335 5689 return err;
4417da66
PM
5690 if (dev->flags != old_flags)
5691 dev_set_rx_mode(dev);
dad9b335 5692 return err;
4417da66 5693}
d1b19dff 5694EXPORT_SYMBOL(dev_set_promiscuity);
4417da66 5695
991fb3f7 5696static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
1da177e4 5697{
991fb3f7 5698 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
1da177e4 5699
24023451
PM
5700 ASSERT_RTNL();
5701
1da177e4 5702 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
5703 dev->allmulti += inc;
5704 if (dev->allmulti == 0) {
5705 /*
5706 * Avoid overflow.
5707 * If inc causes overflow, untouch allmulti and return error.
5708 */
5709 if (inc < 0)
5710 dev->flags &= ~IFF_ALLMULTI;
5711 else {
5712 dev->allmulti -= inc;
7b6cd1ce
JP
5713 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5714 dev->name);
dad9b335
WC
5715 return -EOVERFLOW;
5716 }
5717 }
24023451 5718 if (dev->flags ^ old_flags) {
b6c40d68 5719 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 5720 dev_set_rx_mode(dev);
991fb3f7
ND
5721 if (notify)
5722 __dev_notify_flags(dev, old_flags,
5723 dev->gflags ^ old_gflags);
24023451 5724 }
dad9b335 5725 return 0;
4417da66 5726}
991fb3f7
ND
5727
5728/**
5729 * dev_set_allmulti - update allmulti count on a device
5730 * @dev: device
5731 * @inc: modifier
5732 *
5733 * Add or remove reception of all multicast frames on a device. While the
5734 * count in the device remains above zero the interface keeps listening
5735 * to all multicast addresses. Once it hits zero the device reverts back to normal
5736 * filtering operation. A negative @inc value is used to drop the counter
5737 * when releasing a resource needing all multicasts.
5738 * Return 0 if successful or a negative errno code on error.
5739 */
5740
5741int dev_set_allmulti(struct net_device *dev, int inc)
5742{
5743 return __dev_set_allmulti(dev, inc, true);
5744}
d1b19dff 5745EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
5746
5747/*
5748 * Upload unicast and multicast address lists to device and
5749 * configure RX filtering. When the device doesn't support unicast
53ccaae1 5750 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
5751 * are present.
5752 */
5753void __dev_set_rx_mode(struct net_device *dev)
5754{
d314774c
SH
5755 const struct net_device_ops *ops = dev->netdev_ops;
5756
4417da66
PM
5757 /* dev_open will call this function so the list will stay sane. */
5758 if (!(dev->flags&IFF_UP))
5759 return;
5760
5761 if (!netif_device_present(dev))
40b77c94 5762 return;
4417da66 5763
01789349 5764 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
5765 /* Unicast addresses changes may only happen under the rtnl,
5766 * therefore calling __dev_set_promiscuity here is safe.
5767 */
32e7bfc4 5768 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
991fb3f7 5769 __dev_set_promiscuity(dev, 1, false);
2d348d1f 5770 dev->uc_promisc = true;
32e7bfc4 5771 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
991fb3f7 5772 __dev_set_promiscuity(dev, -1, false);
2d348d1f 5773 dev->uc_promisc = false;
4417da66 5774 }
4417da66 5775 }
01789349
JP
5776
5777 if (ops->ndo_set_rx_mode)
5778 ops->ndo_set_rx_mode(dev);
4417da66
PM
5779}
5780
5781void dev_set_rx_mode(struct net_device *dev)
5782{
b9e40857 5783 netif_addr_lock_bh(dev);
4417da66 5784 __dev_set_rx_mode(dev);
b9e40857 5785 netif_addr_unlock_bh(dev);
1da177e4
LT
5786}
5787
f0db275a
SH
5788/**
5789 * dev_get_flags - get flags reported to userspace
5790 * @dev: device
5791 *
5792 * Get the combination of flag bits exported through APIs to userspace.
5793 */
95c96174 5794unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 5795{
95c96174 5796 unsigned int flags;
1da177e4
LT
5797
5798 flags = (dev->flags & ~(IFF_PROMISC |
5799 IFF_ALLMULTI |
b00055aa
SR
5800 IFF_RUNNING |
5801 IFF_LOWER_UP |
5802 IFF_DORMANT)) |
1da177e4
LT
5803 (dev->gflags & (IFF_PROMISC |
5804 IFF_ALLMULTI));
5805
b00055aa
SR
5806 if (netif_running(dev)) {
5807 if (netif_oper_up(dev))
5808 flags |= IFF_RUNNING;
5809 if (netif_carrier_ok(dev))
5810 flags |= IFF_LOWER_UP;
5811 if (netif_dormant(dev))
5812 flags |= IFF_DORMANT;
5813 }
1da177e4
LT
5814
5815 return flags;
5816}
d1b19dff 5817EXPORT_SYMBOL(dev_get_flags);
1da177e4 5818
bd380811 5819int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 5820{
b536db93 5821 unsigned int old_flags = dev->flags;
bd380811 5822 int ret;
1da177e4 5823
24023451
PM
5824 ASSERT_RTNL();
5825
1da177e4
LT
5826 /*
5827 * Set the flags on our device.
5828 */
5829
5830 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5831 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5832 IFF_AUTOMEDIA)) |
5833 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5834 IFF_ALLMULTI));
5835
5836 /*
5837 * Load in the correct multicast list now the flags have changed.
5838 */
5839
b6c40d68
PM
5840 if ((old_flags ^ flags) & IFF_MULTICAST)
5841 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 5842
4417da66 5843 dev_set_rx_mode(dev);
1da177e4
LT
5844
5845 /*
5846 * Have we downed the interface. We handle IFF_UP ourselves
5847 * according to user attempts to set it, rather than blindly
5848 * setting it.
5849 */
5850
5851 ret = 0;
d215d10f 5852 if ((old_flags ^ flags) & IFF_UP)
bd380811 5853 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4 5854
1da177e4 5855 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff 5856 int inc = (flags & IFF_PROMISC) ? 1 : -1;
991fb3f7 5857 unsigned int old_flags = dev->flags;
d1b19dff 5858
1da177e4 5859 dev->gflags ^= IFF_PROMISC;
991fb3f7
ND
5860
5861 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5862 if (dev->flags != old_flags)
5863 dev_set_rx_mode(dev);
1da177e4
LT
5864 }
5865
5866 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5867 is important. Some (broken) drivers set IFF_PROMISC when
5868 IFF_ALLMULTI is requested, without asking us and without reporting it.
5869 */
5870 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
5871 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5872
1da177e4 5873 dev->gflags ^= IFF_ALLMULTI;
991fb3f7 5874 __dev_set_allmulti(dev, inc, false);
1da177e4
LT
5875 }
5876
bd380811
PM
5877 return ret;
5878}
5879
a528c219
ND
5880void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5881 unsigned int gchanges)
bd380811
PM
5882{
5883 unsigned int changes = dev->flags ^ old_flags;
5884
a528c219 5885 if (gchanges)
7f294054 5886 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
a528c219 5887
bd380811
PM
5888 if (changes & IFF_UP) {
5889 if (dev->flags & IFF_UP)
5890 call_netdevice_notifiers(NETDEV_UP, dev);
5891 else
5892 call_netdevice_notifiers(NETDEV_DOWN, dev);
5893 }
5894
5895 if (dev->flags & IFF_UP &&
be9efd36
JP
5896 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5897 struct netdev_notifier_change_info change_info;
5898
5899 change_info.flags_changed = changes;
5900 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5901 &change_info.info);
5902 }
bd380811
PM
5903}
5904
5905/**
5906 * dev_change_flags - change device settings
5907 * @dev: device
5908 * @flags: device state flags
5909 *
5910 * Change settings on device based state flags. The flags are
5911 * in the userspace exported format.
5912 */
b536db93 5913int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 5914{
b536db93 5915 int ret;
991fb3f7 5916 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
bd380811
PM
5917
5918 ret = __dev_change_flags(dev, flags);
5919 if (ret < 0)
5920 return ret;
5921
991fb3f7 5922 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
a528c219 5923 __dev_notify_flags(dev, old_flags, changes);
1da177e4
LT
5924 return ret;
5925}
d1b19dff 5926EXPORT_SYMBOL(dev_change_flags);
1da177e4 5927
2315dc91
VF
5928static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5929{
5930 const struct net_device_ops *ops = dev->netdev_ops;
5931
5932 if (ops->ndo_change_mtu)
5933 return ops->ndo_change_mtu(dev, new_mtu);
5934
5935 dev->mtu = new_mtu;
5936 return 0;
5937}
5938
f0db275a
SH
5939/**
5940 * dev_set_mtu - Change maximum transfer unit
5941 * @dev: device
5942 * @new_mtu: new transfer unit
5943 *
5944 * Change the maximum transfer size of the network device.
5945 */
1da177e4
LT
5946int dev_set_mtu(struct net_device *dev, int new_mtu)
5947{
2315dc91 5948 int err, orig_mtu;
1da177e4
LT
5949
5950 if (new_mtu == dev->mtu)
5951 return 0;
5952
5953 /* MTU must not be negative. */
5954 if (new_mtu < 0)
5955 return -EINVAL;
5956
5957 if (!netif_device_present(dev))
5958 return -ENODEV;
5959
1d486bfb
VF
5960 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5961 err = notifier_to_errno(err);
5962 if (err)
5963 return err;
d314774c 5964
2315dc91
VF
5965 orig_mtu = dev->mtu;
5966 err = __dev_set_mtu(dev, new_mtu);
d314774c 5967
2315dc91
VF
5968 if (!err) {
5969 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5970 err = notifier_to_errno(err);
5971 if (err) {
5972 /* setting mtu back and notifying everyone again,
5973 * so that they have a chance to revert changes.
5974 */
5975 __dev_set_mtu(dev, orig_mtu);
5976 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5977 }
5978 }
1da177e4
LT
5979 return err;
5980}
d1b19dff 5981EXPORT_SYMBOL(dev_set_mtu);
1da177e4 5982
cbda10fa
VD
5983/**
5984 * dev_set_group - Change group this device belongs to
5985 * @dev: device
5986 * @new_group: group this device should belong to
5987 */
5988void dev_set_group(struct net_device *dev, int new_group)
5989{
5990 dev->group = new_group;
5991}
5992EXPORT_SYMBOL(dev_set_group);
5993
f0db275a
SH
5994/**
5995 * dev_set_mac_address - Change Media Access Control Address
5996 * @dev: device
5997 * @sa: new address
5998 *
5999 * Change the hardware (MAC) address of the device
6000 */
1da177e4
LT
6001int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6002{
d314774c 6003 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
6004 int err;
6005
d314774c 6006 if (!ops->ndo_set_mac_address)
1da177e4
LT
6007 return -EOPNOTSUPP;
6008 if (sa->sa_family != dev->type)
6009 return -EINVAL;
6010 if (!netif_device_present(dev))
6011 return -ENODEV;
d314774c 6012 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
6013 if (err)
6014 return err;
fbdeca2d 6015 dev->addr_assign_type = NET_ADDR_SET;
f6521516 6016 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 6017 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 6018 return 0;
1da177e4 6019}
d1b19dff 6020EXPORT_SYMBOL(dev_set_mac_address);
1da177e4 6021
4bf84c35
JP
6022/**
6023 * dev_change_carrier - Change device carrier
6024 * @dev: device
691b3b7e 6025 * @new_carrier: new value
4bf84c35
JP
6026 *
6027 * Change device carrier
6028 */
6029int dev_change_carrier(struct net_device *dev, bool new_carrier)
6030{
6031 const struct net_device_ops *ops = dev->netdev_ops;
6032
6033 if (!ops->ndo_change_carrier)
6034 return -EOPNOTSUPP;
6035 if (!netif_device_present(dev))
6036 return -ENODEV;
6037 return ops->ndo_change_carrier(dev, new_carrier);
6038}
6039EXPORT_SYMBOL(dev_change_carrier);
6040
66b52b0d
JP
6041/**
6042 * dev_get_phys_port_id - Get device physical port ID
6043 * @dev: device
6044 * @ppid: port ID
6045 *
6046 * Get device physical port ID
6047 */
6048int dev_get_phys_port_id(struct net_device *dev,
02637fce 6049 struct netdev_phys_item_id *ppid)
66b52b0d
JP
6050{
6051 const struct net_device_ops *ops = dev->netdev_ops;
6052
6053 if (!ops->ndo_get_phys_port_id)
6054 return -EOPNOTSUPP;
6055 return ops->ndo_get_phys_port_id(dev, ppid);
6056}
6057EXPORT_SYMBOL(dev_get_phys_port_id);
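/* Illustrative sketch, not part of dev.c: querying and printing a port's
 * physical ID.  Drivers that cannot report one return -EOPNOTSUPP, which
 * callers should tolerate; the %*phN format prints id_len bytes of the ID
 * as hex, as the sysfs code does.
 */
static void example_show_phys_port_id(struct net_device *dev)
{
        struct netdev_phys_item_id ppid;
        int err;

        err = dev_get_phys_port_id(dev, &ppid);
        if (err) {
                netdev_dbg(dev, "no phys port id (%d)\n", err);
                return;
        }
        netdev_info(dev, "phys port id %*phN\n", ppid.id_len, ppid.id);
}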
6058
db24a904
DA
6059/**
6060 * dev_get_phys_port_name - Get device physical port name
6061 * @dev: device
6062 * @name: buffer for the port name (at most @len bytes)
6063 *
6064 * Get device physical port name
6065 */
6066int dev_get_phys_port_name(struct net_device *dev,
6067 char *name, size_t len)
6068{
6069 const struct net_device_ops *ops = dev->netdev_ops;
6070
6071 if (!ops->ndo_get_phys_port_name)
6072 return -EOPNOTSUPP;
6073 return ops->ndo_get_phys_port_name(dev, name, len);
6074}
6075EXPORT_SYMBOL(dev_get_phys_port_name);
6076
d746d707
AK
6077/**
6078 * dev_change_proto_down - update protocol port state information
6079 * @dev: device
6080 * @proto_down: new value
6081 *
6082 * This info can be used by switch drivers to set the phys state of the
6083 * port.
6084 */
6085int dev_change_proto_down(struct net_device *dev, bool proto_down)
6086{
6087 const struct net_device_ops *ops = dev->netdev_ops;
6088
6089 if (!ops->ndo_change_proto_down)
6090 return -EOPNOTSUPP;
6091 if (!netif_device_present(dev))
6092 return -ENODEV;
6093 return ops->ndo_change_proto_down(dev, proto_down);
6094}
6095EXPORT_SYMBOL(dev_change_proto_down);
6096
1da177e4
LT
6097/**
6098 * dev_new_index - allocate an ifindex
c4ea43c5 6099 * @net: the applicable net namespace
1da177e4
LT
6100 *
6101 * Returns a suitable unique value for a new device interface
6102 * number. The caller must hold the rtnl semaphore or the
6103 * dev_base_lock to be sure it remains unique.
6104 */
881d966b 6105static int dev_new_index(struct net *net)
1da177e4 6106{
aa79e66e 6107 int ifindex = net->ifindex;
1da177e4
LT
6108 for (;;) {
6109 if (++ifindex <= 0)
6110 ifindex = 1;
881d966b 6111 if (!__dev_get_by_index(net, ifindex))
aa79e66e 6112 return net->ifindex = ifindex;
1da177e4
LT
6113 }
6114}
6115
1da177e4 6116/* Delayed registration/unregisteration */
3b5b34fd 6117static LIST_HEAD(net_todo_list);
200b916f 6118DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
1da177e4 6119
6f05f629 6120static void net_set_todo(struct net_device *dev)
1da177e4 6121{
1da177e4 6122 list_add_tail(&dev->todo_list, &net_todo_list);
50624c93 6123 dev_net(dev)->dev_unreg_count++;
1da177e4
LT
6124}
6125
9b5e383c 6126static void rollback_registered_many(struct list_head *head)
93ee31f1 6127{
e93737b0 6128 struct net_device *dev, *tmp;
5cde2829 6129 LIST_HEAD(close_head);
9b5e383c 6130
93ee31f1
DL
6131 BUG_ON(dev_boot_phase);
6132 ASSERT_RTNL();
6133
e93737b0 6134 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 6135 /* Some devices call this without ever having registered,
e93737b0
KK
6136 * as part of initialization unwind. Remove those
6137 * devices and proceed with the remaining.
9b5e383c
ED
6138 */
6139 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
6140 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6141 dev->name, dev);
93ee31f1 6142
9b5e383c 6143 WARN_ON(1);
e93737b0
KK
6144 list_del(&dev->unreg_list);
6145 continue;
9b5e383c 6146 }
449f4544 6147 dev->dismantle = true;
9b5e383c 6148 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 6149 }
93ee31f1 6150
44345724 6151 /* If device is running, close it first. */
5cde2829
EB
6152 list_for_each_entry(dev, head, unreg_list)
6153 list_add_tail(&dev->close_list, &close_head);
99c4a26a 6154 dev_close_many(&close_head, true);
93ee31f1 6155
44345724 6156 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
6157 /* And unlink it from device chain. */
6158 unlist_netdevice(dev);
93ee31f1 6159
9b5e383c 6160 dev->reg_state = NETREG_UNREGISTERING;
e9e4dd32 6161 on_each_cpu(flush_backlog, dev, 1);
9b5e383c 6162 }
93ee31f1
DL
6163
6164 synchronize_net();
6165
9b5e383c 6166 list_for_each_entry(dev, head, unreg_list) {
395eea6c
MB
6167 struct sk_buff *skb = NULL;
6168
9b5e383c
ED
6169 /* Shutdown queueing discipline. */
6170 dev_shutdown(dev);
93ee31f1
DL
6171
6172
9b5e383c
ED
6173 /* Notify protocols that we are about to destroy
6174 this device. They should clean all the things.
6175 */
6176 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 6177
395eea6c
MB
6178 if (!dev->rtnl_link_ops ||
6179 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6180 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6181 GFP_KERNEL);
6182
9b5e383c
ED
6183 /*
6184 * Flush the unicast and multicast chains
6185 */
a748ee24 6186 dev_uc_flush(dev);
22bedad3 6187 dev_mc_flush(dev);
93ee31f1 6188
9b5e383c
ED
6189 if (dev->netdev_ops->ndo_uninit)
6190 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 6191
395eea6c
MB
6192 if (skb)
6193 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
56bfa7ee 6194
9ff162a8
JP
6195 /* Notifier chain MUST detach us all upper devices. */
6196 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 6197
9b5e383c
ED
6198 /* Remove entries from kobject tree */
6199 netdev_unregister_kobject(dev);
024e9679
AD
6200#ifdef CONFIG_XPS
6201 /* Remove XPS queueing entries */
6202 netif_reset_xps_queues_gt(dev, 0);
6203#endif
9b5e383c 6204 }
93ee31f1 6205
850a545b 6206 synchronize_net();
395264d5 6207
a5ee1551 6208 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
6209 dev_put(dev);
6210}
6211
6212static void rollback_registered(struct net_device *dev)
6213{
6214 LIST_HEAD(single);
6215
6216 list_add(&dev->unreg_list, &single);
6217 rollback_registered_many(&single);
ceaaec98 6218 list_del(&single);
93ee31f1
DL
6219}
6220
c8f44aff
MM
6221static netdev_features_t netdev_fix_features(struct net_device *dev,
6222 netdev_features_t features)
b63365a2 6223{
57422dc5
MM
6224 /* Fix illegal checksum combinations */
6225 if ((features & NETIF_F_HW_CSUM) &&
6226 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6227 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
6228 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6229 }
6230
b63365a2 6231 /* TSO requires that SG is present as well. */
ea2d3688 6232 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 6233 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 6234 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
6235 }
6236
ec5f0615
PS
6237 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6238 !(features & NETIF_F_IP_CSUM)) {
6239 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6240 features &= ~NETIF_F_TSO;
6241 features &= ~NETIF_F_TSO_ECN;
6242 }
6243
6244 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6245 !(features & NETIF_F_IPV6_CSUM)) {
6246 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6247 features &= ~NETIF_F_TSO6;
6248 }
6249
31d8b9e0
BH
6250 /* TSO ECN requires that TSO is present as well. */
6251 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6252 features &= ~NETIF_F_TSO_ECN;
6253
212b573f
MM
6254 /* Software GSO depends on SG. */
6255 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 6256 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
6257 features &= ~NETIF_F_GSO;
6258 }
6259
acd1130e 6260 /* UFO needs SG and checksumming */
b63365a2 6261 if (features & NETIF_F_UFO) {
79032644
MM
6262 /* maybe split UFO into V4 and V6? */
6263 if (!((features & NETIF_F_GEN_CSUM) ||
6264 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6265 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6266 netdev_dbg(dev,
acd1130e 6267 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
6268 features &= ~NETIF_F_UFO;
6269 }
6270
6271 if (!(features & NETIF_F_SG)) {
6f404e44 6272 netdev_dbg(dev,
acd1130e 6273 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
6274 features &= ~NETIF_F_UFO;
6275 }
6276 }
6277
d0290214
JP
6278#ifdef CONFIG_NET_RX_BUSY_POLL
6279 if (dev->netdev_ops->ndo_busy_poll)
6280 features |= NETIF_F_BUSY_POLL;
6281 else
6282#endif
6283 features &= ~NETIF_F_BUSY_POLL;
6284
b63365a2
HX
6285 return features;
6286}
b63365a2 6287
6cb6a27c 6288int __netdev_update_features(struct net_device *dev)
5455c699 6289{
c8f44aff 6290 netdev_features_t features;
5455c699
MM
6291 int err = 0;
6292
87267485
MM
6293 ASSERT_RTNL();
6294
5455c699
MM
6295 features = netdev_get_wanted_features(dev);
6296
6297 if (dev->netdev_ops->ndo_fix_features)
6298 features = dev->netdev_ops->ndo_fix_features(dev, features);
6299
6300 /* driver might be less strict about feature dependencies */
6301 features = netdev_fix_features(dev, features);
6302
6303 if (dev->features == features)
6cb6a27c 6304 return 0;
5455c699 6305
c8f44aff
MM
6306 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6307 &dev->features, &features);
5455c699
MM
6308
6309 if (dev->netdev_ops->ndo_set_features)
6310 err = dev->netdev_ops->ndo_set_features(dev, features);
6311
6cb6a27c 6312 if (unlikely(err < 0)) {
5455c699 6313 netdev_err(dev,
c8f44aff
MM
6314 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6315 err, &features, &dev->features);
6cb6a27c
MM
6316 return -1;
6317 }
6318
6319 if (!err)
6320 dev->features = features;
6321
6322 return 1;
6323}
6324
afe12cc8
MM
6325/**
6326 * netdev_update_features - recalculate device features
6327 * @dev: the device to check
6328 *
6329 * Recalculate dev->features set and send notifications if it
6330 * has changed. Should be called after driver or hardware dependent
6331 * conditions might have changed that influence the features.
6332 */
6cb6a27c
MM
6333void netdev_update_features(struct net_device *dev)
6334{
6335 if (__netdev_update_features(dev))
6336 netdev_features_change(dev);
5455c699
MM
6337}
6338EXPORT_SYMBOL(netdev_update_features);
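/* Illustrative sketch, not part of dev.c: netdev_update_features() is what
 * a driver calls after a condition that its ndo_fix_features() callback
 * depends on has changed, so the core re-runs the fix/set sequence above.
 * "struct example_priv", its tso_broken flag and the (unshown) wiring of
 * example_fix_features() into the driver's netdev_ops are assumptions made
 * for this example.
 */
struct example_priv {
        bool tso_broken;
};

static netdev_features_t example_fix_features(struct net_device *dev,
                                               netdev_features_t features)
{
        struct example_priv *priv = netdev_priv(dev);

        if (priv->tso_broken)
                features &= ~NETIF_F_ALL_TSO;
        return features;
}

static void example_tso_state_changed(struct net_device *dev, bool broken)
{
        struct example_priv *priv = netdev_priv(dev);

        ASSERT_RTNL();
        priv->tso_broken = broken;
        netdev_update_features(dev);    /* re-evaluates dev->features */
}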
6339
afe12cc8
MM
6340/**
6341 * netdev_change_features - recalculate device features
6342 * @dev: the device to check
6343 *
6344 * Recalculate dev->features set and send notifications even
6345 * if they have not changed. Should be called instead of
6346 * netdev_update_features() if also dev->vlan_features might
6347 * have changed to allow the changes to be propagated to stacked
6348 * VLAN devices.
6349 */
6350void netdev_change_features(struct net_device *dev)
6351{
6352 __netdev_update_features(dev);
6353 netdev_features_change(dev);
6354}
6355EXPORT_SYMBOL(netdev_change_features);
6356
fc4a7489
PM
6357/**
6358 * netif_stacked_transfer_operstate - transfer operstate
6359 * @rootdev: the root or lower level device to transfer state from
6360 * @dev: the device to transfer operstate to
6361 *
6362 * Transfer operational state from root to device. This is normally
6363 * called when a stacking relationship exists between the root
6364 * device and the device (a leaf device).
6365 */
6366void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6367 struct net_device *dev)
6368{
6369 if (rootdev->operstate == IF_OPER_DORMANT)
6370 netif_dormant_on(dev);
6371 else
6372 netif_dormant_off(dev);
6373
6374 if (netif_carrier_ok(rootdev)) {
6375 if (!netif_carrier_ok(dev))
6376 netif_carrier_on(dev);
6377 } else {
6378 if (netif_carrier_ok(dev))
6379 netif_carrier_off(dev);
6380 }
6381}
6382EXPORT_SYMBOL(netif_stacked_transfer_operstate);
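/* Illustrative sketch, not part of dev.c: a stacking driver propagating
 * carrier/dormant state from a lower device to the device built on top of
 * it from its netdevice notifier, the way VLAN-style drivers use
 * netif_stacked_transfer_operstate().  Notifiers run under RTNL, so
 * netdev_master_upper_dev_get() is safe to call here.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *lower = netdev_notifier_info_to_dev(ptr);
        struct net_device *upper = netdev_master_upper_dev_get(lower);

        if (upper && event == NETDEV_CHANGE)
                netif_stacked_transfer_operstate(lower, upper);

        return NOTIFY_DONE;
}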
6383
a953be53 6384#ifdef CONFIG_SYSFS
1b4bf461
ED
6385static int netif_alloc_rx_queues(struct net_device *dev)
6386{
1b4bf461 6387 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 6388 struct netdev_rx_queue *rx;
10595902 6389 size_t sz = count * sizeof(*rx);
1b4bf461 6390
bd25fa7b 6391 BUG_ON(count < 1);
1b4bf461 6392
10595902
PG
6393 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6394 if (!rx) {
6395 rx = vzalloc(sz);
6396 if (!rx)
6397 return -ENOMEM;
6398 }
bd25fa7b
TH
6399 dev->_rx = rx;
6400
bd25fa7b 6401 for (i = 0; i < count; i++)
fe822240 6402 rx[i].dev = dev;
1b4bf461
ED
6403 return 0;
6404}
bf264145 6405#endif
1b4bf461 6406
aa942104
CG
6407static void netdev_init_one_queue(struct net_device *dev,
6408 struct netdev_queue *queue, void *_unused)
6409{
6410 /* Initialize queue lock */
6411 spin_lock_init(&queue->_xmit_lock);
6412 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6413 queue->xmit_lock_owner = -1;
b236da69 6414 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 6415 queue->dev = dev;
114cf580
TH
6416#ifdef CONFIG_BQL
6417 dql_init(&queue->dql, HZ);
6418#endif
aa942104
CG
6419}
6420
60877a32
ED
6421static void netif_free_tx_queues(struct net_device *dev)
6422{
4cb28970 6423 kvfree(dev->_tx);
60877a32
ED
6424}
6425
e6484930
TH
6426static int netif_alloc_netdev_queues(struct net_device *dev)
6427{
6428 unsigned int count = dev->num_tx_queues;
6429 struct netdev_queue *tx;
60877a32 6430 size_t sz = count * sizeof(*tx);
e6484930 6431
d339727c
ED
6432 if (count < 1 || count > 0xffff)
6433 return -EINVAL;
62b5942a 6434
60877a32
ED
6435 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6436 if (!tx) {
6437 tx = vzalloc(sz);
6438 if (!tx)
6439 return -ENOMEM;
6440 }
e6484930 6441 dev->_tx = tx;
1d24eb48 6442
e6484930
TH
6443 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6444 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
6445
6446 return 0;
e6484930
TH
6447}
6448
a2029240
DV
6449void netif_tx_stop_all_queues(struct net_device *dev)
6450{
6451 unsigned int i;
6452
6453 for (i = 0; i < dev->num_tx_queues; i++) {
6454 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6455 netif_tx_stop_queue(txq);
6456 }
6457}
6458EXPORT_SYMBOL(netif_tx_stop_all_queues);
6459
1da177e4
LT
6460/**
6461 * register_netdevice - register a network device
6462 * @dev: device to register
6463 *
6464 * Take a completed network device structure and add it to the kernel
6465 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6466 * chain. 0 is returned on success. A negative errno code is returned
6467 * on a failure to set up the device, or if the name is a duplicate.
6468 *
6469 * Callers must hold the rtnl semaphore. You may want
6470 * register_netdev() instead of this.
6471 *
6472 * BUGS:
6473 * The locking appears insufficient to guarantee two parallel registers
6474 * will not get the same name.
6475 */
6476
6477int register_netdevice(struct net_device *dev)
6478{
1da177e4 6479 int ret;
d314774c 6480 struct net *net = dev_net(dev);
1da177e4
LT
6481
6482 BUG_ON(dev_boot_phase);
6483 ASSERT_RTNL();
6484
b17a7c17
SH
6485 might_sleep();
6486
1da177e4
LT
6487 /* When net_device's are persistent, this will be fatal. */
6488 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 6489 BUG_ON(!net);
1da177e4 6490
f1f28aa3 6491 spin_lock_init(&dev->addr_list_lock);
cf508b12 6492 netdev_set_addr_lockdep_class(dev);
1da177e4 6493
828de4f6 6494 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
6495 if (ret < 0)
6496 goto out;
6497
1da177e4 6498 /* Init, if this function is available */
d314774c
SH
6499 if (dev->netdev_ops->ndo_init) {
6500 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
6501 if (ret) {
6502 if (ret > 0)
6503 ret = -EIO;
90833aa4 6504 goto out;
1da177e4
LT
6505 }
6506 }
4ec93edb 6507
f646968f
PM
6508 if (((dev->hw_features | dev->features) &
6509 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
6510 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6511 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6512 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6513 ret = -EINVAL;
6514 goto err_uninit;
6515 }
6516
9c7dafbf
PE
6517 ret = -EBUSY;
6518 if (!dev->ifindex)
6519 dev->ifindex = dev_new_index(net);
6520 else if (__dev_get_by_index(net, dev->ifindex))
6521 goto err_uninit;
6522
5455c699
MM
6523 /* Transfer changeable features to wanted_features and enable
6524 * software offloads (GSO and GRO).
6525 */
6526 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
6527 dev->features |= NETIF_F_SOFT_FEATURES;
6528 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 6529
34324dc2
MM
6530 if (!(dev->flags & IFF_LOOPBACK)) {
6531 dev->hw_features |= NETIF_F_NOCACHE_COPY;
c6e1a0d1
TH
6532 }
6533
1180e7d6 6534 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 6535 */
1180e7d6 6536 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 6537
ee579677
PS
6538 /* Make NETIF_F_SG inheritable to tunnel devices.
6539 */
6540 dev->hw_enc_features |= NETIF_F_SG;
6541
0d89d203
SH
6542 /* Make NETIF_F_SG inheritable to MPLS.
6543 */
6544 dev->mpls_features |= NETIF_F_SG;
6545
7ffbe3fd
JB
6546 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6547 ret = notifier_to_errno(ret);
6548 if (ret)
6549 goto err_uninit;
6550
8b41d188 6551 ret = netdev_register_kobject(dev);
b17a7c17 6552 if (ret)
7ce1b0ed 6553 goto err_uninit;
b17a7c17
SH
6554 dev->reg_state = NETREG_REGISTERED;
6555
6cb6a27c 6556 __netdev_update_features(dev);
8e9b59b2 6557
1da177e4
LT
6558 /*
6559 * Default initial state at registry is that the
6560 * device is present.
6561 */
6562
6563 set_bit(__LINK_STATE_PRESENT, &dev->state);
6564
8f4cccbb
BH
6565 linkwatch_init_dev(dev);
6566
1da177e4 6567 dev_init_scheduler(dev);
1da177e4 6568 dev_hold(dev);
ce286d32 6569 list_netdevice(dev);
7bf23575 6570 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 6571
948b337e
JP
6572 /* If the device has permanent device address, driver should
6573 * set dev_addr and also addr_assign_type should be set to
6574 * NET_ADDR_PERM (default value).
6575 */
6576 if (dev->addr_assign_type == NET_ADDR_PERM)
6577 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6578
1da177e4 6579 /* Notify protocols that a new device appeared. */
056925ab 6580 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 6581 ret = notifier_to_errno(ret);
93ee31f1
DL
6582 if (ret) {
6583 rollback_registered(dev);
6584 dev->reg_state = NETREG_UNREGISTERED;
6585 }
d90a909e
EB
6586 /*
6587 * Prevent userspace races by waiting until the network
6588 * device is fully setup before sending notifications.
6589 */
a2835763
PM
6590 if (!dev->rtnl_link_ops ||
6591 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7f294054 6592 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
1da177e4
LT
6593
6594out:
6595 return ret;
7ce1b0ed
HX
6596
6597err_uninit:
d314774c
SH
6598 if (dev->netdev_ops->ndo_uninit)
6599 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 6600 goto out;
1da177e4 6601}
d1b19dff 6602EXPORT_SYMBOL(register_netdevice);
1da177e4 6603
937f1ba5
BH
6604/**
6605 * init_dummy_netdev - init a dummy network device for NAPI
6606 * @dev: device to init
6607 *
6608 * This takes a network device structure and initializes the minimum
6609 * number of fields so it can be used to schedule NAPI polls without
6610 * registering a full blown interface. This is to be used by drivers
6611 * that need to tie several hardware interfaces to a single NAPI
6612 * poll scheduler due to HW limitations.
6613 */
6614int init_dummy_netdev(struct net_device *dev)
6615{
6616 /* Clear everything. Note we don't initialize spinlocks
6617 * as they aren't supposed to be taken by any of the
6618 * NAPI code and this dummy netdev is supposed to be
6619 * only ever used for NAPI polls
6620 */
6621 memset(dev, 0, sizeof(struct net_device));
6622
6623 /* make sure we BUG if trying to hit standard
6624 * register/unregister code path
6625 */
6626 dev->reg_state = NETREG_DUMMY;
6627
937f1ba5
BH
6628 /* NAPI wants this */
6629 INIT_LIST_HEAD(&dev->napi_list);
6630
6631 /* a dummy interface is started by default */
6632 set_bit(__LINK_STATE_PRESENT, &dev->state);
6633 set_bit(__LINK_STATE_START, &dev->state);
6634
29b4433d
ED
6635 /* Note: We don't allocate pcpu_refcnt for dummy devices,
6636 * because users of this 'device' don't need to change
6637 * its refcount.
6638 */
6639
937f1ba5
BH
6640 return 0;
6641}
6642EXPORT_SYMBOL_GPL(init_dummy_netdev);
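/* Illustrative sketch, not part of dev.c: a driver that funnels several
 * hardware channels into one NAPI context can hang its napi_struct off a
 * dummy netdev instead of a registered one.  "struct example_adapter" and
 * example_poll() are invented for this example; a real poll routine would
 * process completions before calling napi_complete().
 */
struct example_adapter {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        /* no work in this sketch */
        napi_complete(napi);
        return 0;
}

static void example_adapter_init(struct example_adapter *ad)
{
        init_dummy_netdev(&ad->napi_dev);
        netif_napi_add(&ad->napi_dev, &ad->napi, example_poll,
                       NAPI_POLL_WEIGHT);
        napi_enable(&ad->napi);
}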
6643
6644
1da177e4
LT
6645/**
6646 * register_netdev - register a network device
6647 * @dev: device to register
6648 *
6649 * Take a completed network device structure and add it to the kernel
6650 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6651 * chain. 0 is returned on success. A negative errno code is returned
6652 * on a failure to set up the device, or if the name is a duplicate.
6653 *
38b4da38 6654 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
6655 * and expands the device name if you passed a format string to
6656 * alloc_netdev.
6657 */
6658int register_netdev(struct net_device *dev)
6659{
6660 int err;
6661
6662 rtnl_lock();
1da177e4 6663 err = register_netdevice(dev);
1da177e4
LT
6664 rtnl_unlock();
6665 return err;
6666}
6667EXPORT_SYMBOL(register_netdev);
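/* Illustrative sketch, not part of dev.c: the usual probe-time pattern
 * around register_netdev(): allocate with an alloc_*dev() helper, fill in
 * ops and features, register, and free on failure.  alloc_etherdev() and
 * eth_hw_addr_random() come from <linux/etherdevice.h>; no private area
 * or real ops are set up in this sketch.
 */
static struct net_device *example_probe_netdev(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(0);        /* no private data in this sketch */
        if (!dev)
                return NULL;

        /* a real driver would set dev->netdev_ops and feature flags here */
        eth_hw_addr_random(dev);

        err = register_netdev(dev);
        if (err) {
                free_netdev(dev);       /* required on registration failure */
                return NULL;
        }
        return dev;
}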
6668
29b4433d
ED
6669int netdev_refcnt_read(const struct net_device *dev)
6670{
6671 int i, refcnt = 0;
6672
6673 for_each_possible_cpu(i)
6674 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6675 return refcnt;
6676}
6677EXPORT_SYMBOL(netdev_refcnt_read);
6678
2c53040f 6679/**
1da177e4 6680 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 6681 * @dev: target net_device
1da177e4
LT
6682 *
6683 * This is called when unregistering network devices.
6684 *
6685 * Any protocol or device that holds a reference should register
6686 * for netdevice notification, and clean up and put back the
6687 * reference if they receive an UNREGISTER event.
6688 * We can get stuck here if buggy protocols don't correctly
4ec93edb 6689 * call dev_put.
1da177e4
LT
6690 */
6691static void netdev_wait_allrefs(struct net_device *dev)
6692{
6693 unsigned long rebroadcast_time, warning_time;
29b4433d 6694 int refcnt;
1da177e4 6695
e014debe
ED
6696 linkwatch_forget_dev(dev);
6697
1da177e4 6698 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
6699 refcnt = netdev_refcnt_read(dev);
6700
6701 while (refcnt != 0) {
1da177e4 6702 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 6703 rtnl_lock();
1da177e4
LT
6704
6705 /* Rebroadcast unregister notification */
056925ab 6706 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 6707
748e2d93 6708 __rtnl_unlock();
0115e8e3 6709 rcu_barrier();
748e2d93
ED
6710 rtnl_lock();
6711
0115e8e3 6712 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
6713 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6714 &dev->state)) {
6715 /* We must not have linkwatch events
6716 * pending on unregister. If this
6717 * happens, we simply run the queue
6718 * unscheduled, resulting in a noop
6719 * for this device.
6720 */
6721 linkwatch_run_queue();
6722 }
6723
6756ae4b 6724 __rtnl_unlock();
1da177e4
LT
6725
6726 rebroadcast_time = jiffies;
6727 }
6728
6729 msleep(250);
6730
29b4433d
ED
6731 refcnt = netdev_refcnt_read(dev);
6732
1da177e4 6733 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
6734 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6735 dev->name, refcnt);
1da177e4
LT
6736 warning_time = jiffies;
6737 }
6738 }
6739}
6740
6741/* The sequence is:
6742 *
6743 * rtnl_lock();
6744 * ...
6745 * register_netdevice(x1);
6746 * register_netdevice(x2);
6747 * ...
6748 * unregister_netdevice(y1);
6749 * unregister_netdevice(y2);
6750 * ...
6751 * rtnl_unlock();
6752 * free_netdev(y1);
6753 * free_netdev(y2);
6754 *
58ec3b4d 6755 * We are invoked by rtnl_unlock().
1da177e4 6756 * This allows us to deal with problems:
b17a7c17 6757 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
6758 * without deadlocking with linkwatch via keventd.
6759 * 2) Since we run with the RTNL semaphore not held, we can sleep
6760 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
6761 *
6762 * We must not return until all unregister events added during
6763 * the interval the lock was held have been completed.
1da177e4 6764 */
1da177e4
LT
6765void netdev_run_todo(void)
6766{
626ab0e6 6767 struct list_head list;
1da177e4 6768
1da177e4 6769 /* Snapshot list, allow later requests */
626ab0e6 6770 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
6771
6772 __rtnl_unlock();
626ab0e6 6773
0115e8e3
ED
6774
6775 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
6776 if (!list_empty(&list))
6777 rcu_barrier();
6778
1da177e4
LT
6779 while (!list_empty(&list)) {
6780 struct net_device *dev
e5e26d75 6781 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
6782 list_del(&dev->todo_list);
6783
748e2d93 6784 rtnl_lock();
0115e8e3 6785 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 6786 __rtnl_unlock();
0115e8e3 6787
b17a7c17 6788 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 6789 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
6790 dev->name, dev->reg_state);
6791 dump_stack();
6792 continue;
6793 }
1da177e4 6794
b17a7c17 6795 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 6796
b17a7c17 6797 netdev_wait_allrefs(dev);
1da177e4 6798
b17a7c17 6799 /* paranoia */
29b4433d 6800 BUG_ON(netdev_refcnt_read(dev));
7866a621
SN
6801 BUG_ON(!list_empty(&dev->ptype_all));
6802 BUG_ON(!list_empty(&dev->ptype_specific));
33d480ce
ED
6803 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6804 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 6805 WARN_ON(dev->dn_ptr);
1da177e4 6806
b17a7c17
SH
6807 if (dev->destructor)
6808 dev->destructor(dev);
9093bbb2 6809
50624c93
EB
6810 /* Report a network device has been unregistered */
6811 rtnl_lock();
6812 dev_net(dev)->dev_unreg_count--;
6813 __rtnl_unlock();
6814 wake_up(&netdev_unregistering_wq);
6815
9093bbb2
SH
6816 /* Free network device */
6817 kobject_put(&dev->dev.kobj);
1da177e4 6818 }
1da177e4
LT
6819}
6820
3cfde79c
BH
6821/* Convert net_device_stats to rtnl_link_stats64. They have the same
6822 * fields in the same order, with only the type differing.
6823 */
77a1abf5
ED
6824void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6825 const struct net_device_stats *netdev_stats)
3cfde79c
BH
6826{
6827#if BITS_PER_LONG == 64
77a1abf5
ED
6828 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6829 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
6830#else
6831 size_t i, n = sizeof(*stats64) / sizeof(u64);
6832 const unsigned long *src = (const unsigned long *)netdev_stats;
6833 u64 *dst = (u64 *)stats64;
6834
6835 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6836 sizeof(*stats64) / sizeof(u64));
6837 for (i = 0; i < n; i++)
6838 dst[i] = src[i];
6839#endif
6840}
77a1abf5 6841EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 6842
eeda3fd6
SH
6843/**
6844 * dev_get_stats - get network device statistics
6845 * @dev: device to get statistics from
28172739 6846 * @storage: place to store stats
eeda3fd6 6847 *
d7753516
BH
6848 * Get network statistics from device. Return @storage.
6849 * The device driver may provide its own method by setting
6850 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6851 * otherwise the internal statistics structure is used.
eeda3fd6 6852 */
d7753516
BH
6853struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6854 struct rtnl_link_stats64 *storage)
7004bf25 6855{
eeda3fd6
SH
6856 const struct net_device_ops *ops = dev->netdev_ops;
6857
28172739
ED
6858 if (ops->ndo_get_stats64) {
6859 memset(storage, 0, sizeof(*storage));
caf586e5
ED
6860 ops->ndo_get_stats64(dev, storage);
6861 } else if (ops->ndo_get_stats) {
3cfde79c 6862 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
6863 } else {
6864 netdev_stats_to_stats64(storage, &dev->stats);
28172739 6865 }
caf586e5 6866 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
015f0688 6867 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
28172739 6868 return storage;
c45d286e 6869}
eeda3fd6 6870EXPORT_SYMBOL(dev_get_stats);
c45d286e 6871
24824a09 6872struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 6873{
24824a09 6874 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 6875
24824a09
ED
6876#ifdef CONFIG_NET_CLS_ACT
6877 if (queue)
6878 return queue;
6879 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6880 if (!queue)
6881 return NULL;
6882 netdev_init_one_queue(dev, queue, NULL);
2ce1ee17 6883 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
24824a09
ED
6884 queue->qdisc_sleeping = &noop_qdisc;
6885 rcu_assign_pointer(dev->ingress_queue, queue);
6886#endif
6887 return queue;
bb949fbd
DM
6888}
6889
2c60db03
ED
6890static const struct ethtool_ops default_ethtool_ops;
6891
d07d7507
SG
6892void netdev_set_default_ethtool_ops(struct net_device *dev,
6893 const struct ethtool_ops *ops)
6894{
6895 if (dev->ethtool_ops == &default_ethtool_ops)
6896 dev->ethtool_ops = ops;
6897}
6898EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6899
74d332c1
ED
6900void netdev_freemem(struct net_device *dev)
6901{
6902 char *addr = (char *)dev - dev->padded;
6903
4cb28970 6904 kvfree(addr);
74d332c1
ED
6905}
6906
1da177e4 6907/**
36909ea4 6908 * alloc_netdev_mqs - allocate network device
c835a677
TG
6909 * @sizeof_priv: size of private data to allocate space for
6910 * @name: device name format string
6911 * @name_assign_type: origin of device name
6912 * @setup: callback to initialize device
6913 * @txqs: the number of TX subqueues to allocate
6914 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
6915 *
6916 * Allocates a struct net_device with private data area for driver use
90e51adf 6917 * and performs basic initialization. Also allocates subqueue structs
36909ea4 6918 * for each queue on the device.
1da177e4 6919 */
36909ea4 6920struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
c835a677 6921 unsigned char name_assign_type,
36909ea4
TH
6922 void (*setup)(struct net_device *),
6923 unsigned int txqs, unsigned int rxqs)
1da177e4 6924{
1da177e4 6925 struct net_device *dev;
7943986c 6926 size_t alloc_size;
1ce8e7b5 6927 struct net_device *p;
1da177e4 6928
b6fe17d6
SH
6929 BUG_ON(strlen(name) >= sizeof(dev->name));
6930
36909ea4 6931 if (txqs < 1) {
7b6cd1ce 6932 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
6933 return NULL;
6934 }
6935
a953be53 6936#ifdef CONFIG_SYSFS
36909ea4 6937 if (rxqs < 1) {
7b6cd1ce 6938 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
6939 return NULL;
6940 }
6941#endif
6942
fd2ea0a7 6943 alloc_size = sizeof(struct net_device);
d1643d24
AD
6944 if (sizeof_priv) {
6945 /* ensure 32-byte alignment of private area */
1ce8e7b5 6946 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
6947 alloc_size += sizeof_priv;
6948 }
6949 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 6950 alloc_size += NETDEV_ALIGN - 1;
1da177e4 6951
74d332c1
ED
6952 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6953 if (!p)
6954 p = vzalloc(alloc_size);
62b5942a 6955 if (!p)
1da177e4 6956 return NULL;
1da177e4 6957
1ce8e7b5 6958 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 6959 dev->padded = (char *)dev - (char *)p;
ab9c73cc 6960
29b4433d
ED
6961 dev->pcpu_refcnt = alloc_percpu(int);
6962 if (!dev->pcpu_refcnt)
74d332c1 6963 goto free_dev;
ab9c73cc 6964
ab9c73cc 6965 if (dev_addr_init(dev))
29b4433d 6966 goto free_pcpu;
ab9c73cc 6967
22bedad3 6968 dev_mc_init(dev);
a748ee24 6969 dev_uc_init(dev);
ccffad25 6970
c346dca1 6971 dev_net_set(dev, &init_net);
1da177e4 6972
8d3bdbd5 6973 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 6974 dev->gso_max_segs = GSO_MAX_SEGS;
fcbeb976 6975 dev->gso_min_segs = 0;
8d3bdbd5 6976
8d3bdbd5
DM
6977 INIT_LIST_HEAD(&dev->napi_list);
6978 INIT_LIST_HEAD(&dev->unreg_list);
5cde2829 6979 INIT_LIST_HEAD(&dev->close_list);
8d3bdbd5 6980 INIT_LIST_HEAD(&dev->link_watch_list);
2f268f12
VF
6981 INIT_LIST_HEAD(&dev->adj_list.upper);
6982 INIT_LIST_HEAD(&dev->adj_list.lower);
6983 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6984 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7866a621
SN
6985 INIT_LIST_HEAD(&dev->ptype_all);
6986 INIT_LIST_HEAD(&dev->ptype_specific);
02875878 6987 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8d3bdbd5
DM
6988 setup(dev);
6989
36909ea4
TH
6990 dev->num_tx_queues = txqs;
6991 dev->real_num_tx_queues = txqs;
ed9af2e8 6992 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6993 goto free_all;
e8a0464c 6994
a953be53 6995#ifdef CONFIG_SYSFS
36909ea4
TH
6996 dev->num_rx_queues = rxqs;
6997 dev->real_num_rx_queues = rxqs;
fe822240 6998 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6999 goto free_all;
df334545 7000#endif
0a9627f2 7001
1da177e4 7002 strcpy(dev->name, name);
c835a677 7003 dev->name_assign_type = name_assign_type;
cbda10fa 7004 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
7005 if (!dev->ethtool_ops)
7006 dev->ethtool_ops = &default_ethtool_ops;
e687ad60
PN
7007
7008 nf_hook_ingress_init(dev);
7009
1da177e4 7010 return dev;
ab9c73cc 7011
8d3bdbd5
DM
7012free_all:
7013 free_netdev(dev);
7014 return NULL;
7015
29b4433d
ED
7016free_pcpu:
7017 free_percpu(dev->pcpu_refcnt);
74d332c1
ED
7018free_dev:
7019 netdev_freemem(dev);
ab9c73cc 7020 return NULL;
1da177e4 7021}
36909ea4 7022EXPORT_SYMBOL(alloc_netdev_mqs);
1da177e4
LT
7023
7024/**
7025 * free_netdev - free network device
7026 * @dev: device
7027 *
4ec93edb
YH
7028 * This function does the last stage of destroying an allocated device
7029 * interface. The reference to the device object is released.
1da177e4
LT
7030 * If this is the last reference then it will be freed.
7031 */
7032void free_netdev(struct net_device *dev)
7033{
d565b0a1
HX
7034 struct napi_struct *p, *n;
7035
60877a32 7036 netif_free_tx_queues(dev);
a953be53 7037#ifdef CONFIG_SYSFS
10595902 7038 kvfree(dev->_rx);
fe822240 7039#endif
e8a0464c 7040
33d480ce 7041 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 7042
f001fde5
JP
7043 /* Flush device addresses */
7044 dev_addr_flush(dev);
7045
d565b0a1
HX
7046 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7047 netif_napi_del(p);
7048
29b4433d
ED
7049 free_percpu(dev->pcpu_refcnt);
7050 dev->pcpu_refcnt = NULL;
7051
3041a069 7052 /* Compatibility with error handling in drivers */
1da177e4 7053 if (dev->reg_state == NETREG_UNINITIALIZED) {
74d332c1 7054 netdev_freemem(dev);
1da177e4
LT
7055 return;
7056 }
7057
7058 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7059 dev->reg_state = NETREG_RELEASED;
7060
43cb76d9
GKH
7061 /* will be freed via the device release callback */
7062 put_device(&dev->dev);
1da177e4 7063}
d1b19dff 7064EXPORT_SYMBOL(free_netdev);
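The NETREG_UNINITIALIZED branch above exists for drivers that allocated a device but never successfully registered it. A minimal sketch of that error path (assumed, not taken from any driver verbatim):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static int foo_probe(void)
{
	struct net_device *dev = alloc_etherdev(0);	/* no private area, for brevity */
	int err;

	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		/* never registered: free_netdev() releases the memory directly */
		free_netdev(dev);
		return err;
	}
	return 0;
}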
4ec93edb 7065
f0db275a
SH
7066/**
7067 * synchronize_net - Synchronize with packet receive processing
7068 *
7069 * Wait for packets currently being received to be done.
7070 * Does not block later packets from starting.
7071 */
4ec93edb 7072void synchronize_net(void)
1da177e4
LT
7073{
7074 might_sleep();
be3fc413
ED
7075 if (rtnl_is_locked())
7076 synchronize_rcu_expedited();
7077 else
7078 synchronize_rcu();
1da177e4 7079}
d1b19dff 7080EXPORT_SYMBOL(synchronize_net);
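As an illustrative (assumed) use, a protocol module that has just unhooked a packet handler can call synchronize_net() before freeing the handler's state. Note that dev_remove_pack() performs this wait internally, so the explicit call is only needed with __dev_remove_pack(); foo_rcv(), foo_pt and foo_unhook() are hypothetical:

static int foo_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	kfree_skb(skb);			/* hypothetical handler: just drop */
	return NET_RX_SUCCESS;
}

static struct packet_type foo_pt __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),
	.func	= foo_rcv,
};

static void foo_unhook(void)
{
	/* foo_pt was added earlier with dev_add_pack(&foo_pt) */
	__dev_remove_pack(&foo_pt);
	synchronize_net();		/* no CPU can still be inside foo_rcv() */
	/* anything foo_rcv() relied on may now be freed or reused */
}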
1da177e4
LT
7081
7082/**
44a0873d 7083 * unregister_netdevice_queue - remove device from the kernel
1da177e4 7084 * @dev: device
44a0873d 7085 * @head: list
6ebfbc06 7086 *
1da177e4 7087 * This function shuts down a device interface and removes it
d59b54b1 7088 * from the kernel tables.
44a0873d 7089 * If @head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
7090 *
7091 * Callers must hold the rtnl semaphore. You may want
7092 * unregister_netdev() instead of this.
7093 */
7094
44a0873d 7095void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 7096{
a6620712
HX
7097 ASSERT_RTNL();
7098
44a0873d 7099 if (head) {
9fdce099 7100 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
7101 } else {
7102 rollback_registered(dev);
7103 /* Finish processing unregister after unlock */
7104 net_set_todo(dev);
7105 }
1da177e4 7106}
44a0873d 7107EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 7108
9b5e383c
ED
7109/**
7110 * unregister_netdevice_many - unregister many devices
7111 * @head: list of devices
87757a91
ED
7112 *
7113 * Note: As most callers use a stack-allocated list_head,
7114 * we force a list_del() to make sure the stack won't be corrupted later.
9b5e383c
ED
7115 */
7116void unregister_netdevice_many(struct list_head *head)
7117{
7118 struct net_device *dev;
7119
7120 if (!list_empty(head)) {
7121 rollback_registered_many(head);
7122 list_for_each_entry(dev, head, unreg_list)
7123 net_set_todo(dev);
87757a91 7124 list_del(head);
9b5e383c
ED
7125 }
7126}
63c8099d 7127EXPORT_SYMBOL(unregister_netdevice_many);
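A minimal sketch (assumed, not from this file) of the batching pattern the note above describes: queue several devices onto a stack-allocated list under the RTNL lock, then tear them all down with one call:

#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void foo_destroy_all(struct net_device **devs, unsigned int n)
{
	LIST_HEAD(kill_list);		/* stack-allocated, hence the list_del() above */
	unsigned int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	/* one rollback pass covers the whole batch */
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}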
9b5e383c 7128
1da177e4
LT
7129/**
7130 * unregister_netdev - remove device from the kernel
7131 * @dev: device
7132 *
7133 * This function shuts down a device interface and removes it
d59b54b1 7134 * from the kernel tables.
1da177e4
LT
7135 *
7136 * This is just a wrapper for unregister_netdevice that takes
7137 * the rtnl semaphore. In general you want to use this and not
7138 * unregister_netdevice.
7139 */
7140void unregister_netdev(struct net_device *dev)
7141{
7142 rtnl_lock();
7143 unregister_netdevice(dev);
7144 rtnl_unlock();
7145}
1da177e4
LT
7146EXPORT_SYMBOL(unregister_netdev);
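Typical teardown in a driver's module exit path pairs unregister_netdev() with free_netdev(); foo_dev and foo_exit() below are hypothetical:

#include <linux/module.h>
#include <linux/netdevice.h>

static struct net_device *foo_dev;	/* set up in the module init path */

static void __exit foo_exit(void)
{
	unregister_netdev(foo_dev);	/* takes and drops the RTNL lock itself */
	free_netdev(foo_dev);		/* device is now NETREG_UNREGISTERED */
}
module_exit(foo_exit);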
7147
ce286d32
EB
7148/**
7149 * dev_change_net_namespace - move device to different network namespace
7150 * @dev: device
7151 * @net: network namespace
7152 * @pat: If not NULL, name pattern to try if the current device name
7153 * is already taken in the destination network namespace.
7154 *
7155 * This function shuts down a device interface and moves it
7156 * to a new network namespace. On success 0 is returned, on
7157 * failure a negative errno code is returned.
7158 *
7159 * Callers must hold the rtnl semaphore.
7160 */
7161
7162int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7163{
ce286d32
EB
7164 int err;
7165
7166 ASSERT_RTNL();
7167
7168 /* Don't allow namespace local devices to be moved. */
7169 err = -EINVAL;
7170 if (dev->features & NETIF_F_NETNS_LOCAL)
7171 goto out;
7172
7173 /* Ensure the device has been registered */
ce286d32
EB
7174 if (dev->reg_state != NETREG_REGISTERED)
7175 goto out;
7176
7177 /* Get out if there is nothing to do */
7178 err = 0;
878628fb 7179 if (net_eq(dev_net(dev), net))
ce286d32
EB
7180 goto out;
7181
7182 /* Pick the destination device name, and ensure
7183 * we can use it in the destination network namespace.
7184 */
7185 err = -EEXIST;
d9031024 7186 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
7187 /* We get here if we can't use the current device name */
7188 if (!pat)
7189 goto out;
828de4f6 7190 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
7191 goto out;
7192 }
7193
7194 /*
7195 * And now a mini version of register_netdevice unregister_netdevice.
7196 */
7197
7198 /* If device is running close it first. */
9b772652 7199 dev_close(dev);
ce286d32
EB
7200
7201 /* And unlink it from device chain */
7202 err = -ENODEV;
7203 unlist_netdevice(dev);
7204
7205 synchronize_net();
7206
7207 /* Shutdown queueing discipline. */
7208 dev_shutdown(dev);
7209
7210 /* Notify protocols that we are about to destroy
7211 this device. They should clean all the things.
3b27e105
DL
7212
7213 Note that dev->reg_state stays at NETREG_REGISTERED.
7214 This is wanted so that 8021q and macvlan know
7215 the device is just moving and can keep their slaves up.
ce286d32
EB
7216 */
7217 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
7218 rcu_barrier();
7219 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7f294054 7220 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
ce286d32
EB
7221
7222 /*
7223 * Flush the unicast and multicast chains
7224 */
a748ee24 7225 dev_uc_flush(dev);
22bedad3 7226 dev_mc_flush(dev);
ce286d32 7227
4e66ae2e
SH
7228 /* Send a netdev-removed uevent to the old namespace */
7229 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
4c75431a 7230 netdev_adjacent_del_links(dev);
4e66ae2e 7231
ce286d32 7232 /* Actually switch the network namespace */
c346dca1 7233 dev_net_set(dev, net);
ce286d32 7234
ce286d32 7235 /* If there is an ifindex conflict assign a new one */
7a66bbc9 7236 if (__dev_get_by_index(net, dev->ifindex))
ce286d32 7237 dev->ifindex = dev_new_index(net);
ce286d32 7238
4e66ae2e
SH
7239 /* Send a netdev-add uevent to the new namespace */
7240 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
4c75431a 7241 netdev_adjacent_add_links(dev);
4e66ae2e 7242
8b41d188 7243 /* Fixup kobjects */
a1b3f594 7244 err = device_rename(&dev->dev, dev->name);
8b41d188 7245 WARN_ON(err);
ce286d32
EB
7246
7247 /* Add the device back in the hashes */
7248 list_netdevice(dev);
7249
7250 /* Notify protocols, that a new device appeared. */
7251 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7252
d90a909e
EB
7253 /*
7254 * Prevent userspace races by waiting until the network
7255 * device is fully setup before sending notifications.
7256 */
7f294054 7257 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
d90a909e 7258
ce286d32
EB
7259 synchronize_net();
7260 err = 0;
7261out:
7262 return err;
7263}
463d0183 7264EXPORT_SYMBOL_GPL(dev_change_net_namespace);
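A hedged sketch of a caller, roughly in the spirit of the RTM_NEWLINK IFLA_NET_NS_PID handling: look up the target namespace by process, then move the device under the RTNL lock. foo_move_to_pid_ns() is a hypothetical name:

#include <linux/err.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>

static int foo_move_to_pid_ns(struct net_device *dev, pid_t pid)
{
	struct net *net;
	int err;

	net = get_net_ns_by_pid(pid);	/* holds a reference on success */
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	/* fall back to "dev%d" if the current name is taken over there */
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();

	put_net(net);
	return err;
}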
ce286d32 7265
1da177e4
LT
7266static int dev_cpu_callback(struct notifier_block *nfb,
7267 unsigned long action,
7268 void *ocpu)
7269{
7270 struct sk_buff **list_skb;
1da177e4
LT
7271 struct sk_buff *skb;
7272 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7273 struct softnet_data *sd, *oldsd;
7274
8bb78442 7275 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
7276 return NOTIFY_OK;
7277
7278 local_irq_disable();
7279 cpu = smp_processor_id();
7280 sd = &per_cpu(softnet_data, cpu);
7281 oldsd = &per_cpu(softnet_data, oldcpu);
7282
7283 /* Find end of our completion_queue. */
7284 list_skb = &sd->completion_queue;
7285 while (*list_skb)
7286 list_skb = &(*list_skb)->next;
7287 /* Append completion queue from offline CPU. */
7288 *list_skb = oldsd->completion_queue;
7289 oldsd->completion_queue = NULL;
7290
1da177e4 7291 /* Append output queue from offline CPU. */
a9cbd588
CG
7292 if (oldsd->output_queue) {
7293 *sd->output_queue_tailp = oldsd->output_queue;
7294 sd->output_queue_tailp = oldsd->output_queue_tailp;
7295 oldsd->output_queue = NULL;
7296 oldsd->output_queue_tailp = &oldsd->output_queue;
7297 }
ac64da0b
ED
7298 /* Append NAPI poll list from offline CPU, with one exception:
7299 * process_backlog() must be called by cpu owning percpu backlog.
7300 * We properly handle process_queue & input_pkt_queue later.
7301 */
7302 while (!list_empty(&oldsd->poll_list)) {
7303 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7304 struct napi_struct,
7305 poll_list);
7306
7307 list_del_init(&napi->poll_list);
7308 if (napi->poll == process_backlog)
7309 napi->state = 0;
7310 else
7311 ____napi_schedule(sd, napi);
264524d5 7312 }
1da177e4
LT
7313
7314 raise_softirq_irqoff(NET_TX_SOFTIRQ);
7315 local_irq_enable();
7316
7317 /* Process offline CPU's input_pkt_queue */
76cc8b13 7318 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
91e83133 7319 netif_rx_ni(skb);
76cc8b13 7320 input_queue_head_incr(oldsd);
fec5e652 7321 }
ac64da0b 7322 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
91e83133 7323 netif_rx_ni(skb);
76cc8b13
TH
7324 input_queue_head_incr(oldsd);
7325 }
1da177e4
LT
7326
7327 return NOTIFY_OK;
7328}
1da177e4
LT
7329
7330
7f353bf2 7331/**
b63365a2
HX
7332 * netdev_increment_features - increment feature set by one
7333 * @all: current feature set
7334 * @one: new feature set
7335 * @mask: mask feature set
7f353bf2
HX
7336 *
7337 * Computes a new feature set after adding a device with feature set
b63365a2
HX
7338 * @one to the master device with current feature set @all. Will not
7339 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 7340 */
c8f44aff
MM
7341netdev_features_t netdev_increment_features(netdev_features_t all,
7342 netdev_features_t one, netdev_features_t mask)
b63365a2 7343{
1742f183
MM
7344 if (mask & NETIF_F_GEN_CSUM)
7345 mask |= NETIF_F_ALL_CSUM;
7346 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 7347
1742f183
MM
7348 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7349 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 7350
1742f183
MM
7351 /* If one device supports hw checksumming, set for all. */
7352 if (all & NETIF_F_GEN_CSUM)
7353 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
7354
7355 return all;
7356}
b63365a2 7357EXPORT_SYMBOL(netdev_increment_features);
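For illustration (assumed, not taken from any driver verbatim), a master device such as a bond or team folds its slaves' feature sets together roughly like this, with FOO_MASTER_FEATURES standing in for whatever mask the master is willing to expose:

#include <linux/netdevice.h>

#define FOO_MASTER_FEATURES	(NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA)

static netdev_features_t foo_compute_features(struct net_device **slaves,
					       unsigned int n)
{
	/* start from the mask, then fold in each slave's capabilities */
	netdev_features_t features = FOO_MASTER_FEATURES;
	unsigned int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     FOO_MASTER_FEATURES);
	return features;
}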
7f353bf2 7358
430f03cd 7359static struct hlist_head * __net_init netdev_create_hash(void)
30d97d35
PE
7360{
7361 int i;
7362 struct hlist_head *hash;
7363
7364 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7365 if (hash != NULL)
7366 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7367 INIT_HLIST_HEAD(&hash[i]);
7368
7369 return hash;
7370}
7371
881d966b 7372/* Initialize per network namespace state */
4665079c 7373static int __net_init netdev_init(struct net *net)
881d966b 7374{
734b6541
RM
7375 if (net != &init_net)
7376 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 7377
30d97d35
PE
7378 net->dev_name_head = netdev_create_hash();
7379 if (net->dev_name_head == NULL)
7380 goto err_name;
881d966b 7381
30d97d35
PE
7382 net->dev_index_head = netdev_create_hash();
7383 if (net->dev_index_head == NULL)
7384 goto err_idx;
881d966b
EB
7385
7386 return 0;
30d97d35
PE
7387
7388err_idx:
7389 kfree(net->dev_name_head);
7390err_name:
7391 return -ENOMEM;
881d966b
EB
7392}
7393
f0db275a
SH
7394/**
7395 * netdev_drivername - network driver for the device
7396 * @dev: network device
f0db275a
SH
7397 *
7398 * Determine network driver for device.
7399 */
3019de12 7400const char *netdev_drivername(const struct net_device *dev)
6579e57b 7401{
cf04a4c7
SH
7402 const struct device_driver *driver;
7403 const struct device *parent;
3019de12 7404 const char *empty = "";
6579e57b
AV
7405
7406 parent = dev->dev.parent;
6579e57b 7407 if (!parent)
3019de12 7408 return empty;
6579e57b
AV
7409
7410 driver = parent->driver;
7411 if (driver && driver->name)
3019de12
DM
7412 return driver->name;
7413 return empty;
6579e57b
AV
7414}
7415
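One place this helper is handy (a sketch in the spirit of the transmit watchdog, not this file's code) is tagging diagnostics with the responsible driver; note it carries no EXPORT_SYMBOL above, so only built-in code can call it:

static void foo_report_tx_hang(struct net_device *dev, unsigned int queue)
{
	WARN_ONCE(1, "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
		  dev->name, netdev_drivername(dev), queue);
}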
6ea754eb
JP
7416static void __netdev_printk(const char *level, const struct net_device *dev,
7417 struct va_format *vaf)
256df2f3 7418{
b004ff49 7419 if (dev && dev->dev.parent) {
6ea754eb
JP
7420 dev_printk_emit(level[1] - '0',
7421 dev->dev.parent,
7422 "%s %s %s%s: %pV",
7423 dev_driver_string(dev->dev.parent),
7424 dev_name(dev->dev.parent),
7425 netdev_name(dev), netdev_reg_state(dev),
7426 vaf);
b004ff49 7427 } else if (dev) {
6ea754eb
JP
7428 printk("%s%s%s: %pV",
7429 level, netdev_name(dev), netdev_reg_state(dev), vaf);
b004ff49 7430 } else {
6ea754eb 7431 printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 7432 }
256df2f3
JP
7433}
7434
6ea754eb
JP
7435void netdev_printk(const char *level, const struct net_device *dev,
7436 const char *format, ...)
256df2f3
JP
7437{
7438 struct va_format vaf;
7439 va_list args;
256df2f3
JP
7440
7441 va_start(args, format);
7442
7443 vaf.fmt = format;
7444 vaf.va = &args;
7445
6ea754eb 7446 __netdev_printk(level, dev, &vaf);
b004ff49 7447
256df2f3 7448 va_end(args);
256df2f3
JP
7449}
7450EXPORT_SYMBOL(netdev_printk);
7451
7452#define define_netdev_printk_level(func, level) \
6ea754eb 7453void func(const struct net_device *dev, const char *fmt, ...) \
256df2f3 7454{ \
256df2f3
JP
7455 struct va_format vaf; \
7456 va_list args; \
7457 \
7458 va_start(args, fmt); \
7459 \
7460 vaf.fmt = fmt; \
7461 vaf.va = &args; \
7462 \
6ea754eb 7463 __netdev_printk(level, dev, &vaf); \
b004ff49 7464 \
256df2f3 7465 va_end(args); \
256df2f3
JP
7466} \
7467EXPORT_SYMBOL(func);
7468
7469define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7470define_netdev_printk_level(netdev_alert, KERN_ALERT);
7471define_netdev_printk_level(netdev_crit, KERN_CRIT);
7472define_netdev_printk_level(netdev_err, KERN_ERR);
7473define_netdev_printk_level(netdev_warn, KERN_WARNING);
7474define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7475define_netdev_printk_level(netdev_info, KERN_INFO);
7476
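All of the per-level wrappers generated above funnel into __netdev_printk(), which prefixes messages with the driver, bus and interface names. An illustrative (assumed) driver call site:

static void foo_link_change(struct net_device *dev, bool up, unsigned int mbps)
{
	if (up)
		netdev_info(dev, "link up, %u Mbps\n", mbps);
	else
		netdev_warn(dev, "link down\n");
}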
4665079c 7477static void __net_exit netdev_exit(struct net *net)
881d966b
EB
7478{
7479 kfree(net->dev_name_head);
7480 kfree(net->dev_index_head);
7481}
7482
022cbae6 7483static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
7484 .init = netdev_init,
7485 .exit = netdev_exit,
7486};
7487
4665079c 7488static void __net_exit default_device_exit(struct net *net)
ce286d32 7489{
e008b5fc 7490 struct net_device *dev, *aux;
ce286d32 7491 /*
e008b5fc 7492 * Push all migratable network devices back to the
ce286d32
EB
7493 * initial network namespace
7494 */
7495 rtnl_lock();
e008b5fc 7496 for_each_netdev_safe(net, dev, aux) {
ce286d32 7497 int err;
aca51397 7498 char fb_name[IFNAMSIZ];
ce286d32
EB
7499
7500 /* Ignore unmovable devices (e.g. loopback) */
7501 if (dev->features & NETIF_F_NETNS_LOCAL)
7502 continue;
7503
e008b5fc
EB
7504 /* Leave virtual devices for the generic cleanup */
7505 if (dev->rtnl_link_ops)
7506 continue;
d0c082ce 7507
25985edc 7508 /* Push remaining network devices to init_net */
aca51397
PE
7509 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7510 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 7511 if (err) {
7b6cd1ce
JP
7512 pr_emerg("%s: failed to move %s to init_net: %d\n",
7513 __func__, dev->name, err);
aca51397 7514 BUG();
ce286d32
EB
7515 }
7516 }
7517 rtnl_unlock();
7518}
7519
50624c93
EB
7520static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7521{
7522 /* Return with the rtnl_lock held when there are no network
7523 * devices unregistering in any network namespace in net_list.
7524 */
7525 struct net *net;
7526 bool unregistering;
ff960a73 7527 DEFINE_WAIT_FUNC(wait, woken_wake_function);
50624c93 7528
ff960a73 7529 add_wait_queue(&netdev_unregistering_wq, &wait);
50624c93 7530 for (;;) {
50624c93
EB
7531 unregistering = false;
7532 rtnl_lock();
7533 list_for_each_entry(net, net_list, exit_list) {
7534 if (net->dev_unreg_count > 0) {
7535 unregistering = true;
7536 break;
7537 }
7538 }
7539 if (!unregistering)
7540 break;
7541 __rtnl_unlock();
ff960a73
PZ
7542
7543 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
50624c93 7544 }
ff960a73 7545 remove_wait_queue(&netdev_unregistering_wq, &wait);
50624c93
EB
7546}
7547
04dc7f6b
EB
7548static void __net_exit default_device_exit_batch(struct list_head *net_list)
7549{
7550 /* At exit, all network devices must be removed from a network
b595076a 7551 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
7552 * Do this across as many network namespaces as possible to
7553 * improve batching efficiency.
7554 */
7555 struct net_device *dev;
7556 struct net *net;
7557 LIST_HEAD(dev_kill_list);
7558
50624c93
EB
7559 /* To prevent network device cleanup code from dereferencing
7560 * loopback devices or network devices that have been freed,
7561 * wait here for all pending unregistrations to complete
7562 * before unregistering the loopback device and allowing the
7563 * network namespace to be freed.
7564 *
7565 * The netdev todo list, which contains all network device
7566 * unregistrations that happen in default_device_exit_batch,
7567 * is processed in the rtnl_unlock() at the end of
7568 * default_device_exit_batch.
7569 */
7570 rtnl_lock_unregistering(net_list);
04dc7f6b
EB
7571 list_for_each_entry(net, net_list, exit_list) {
7572 for_each_netdev_reverse(net, dev) {
b0ab2fab 7573 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
04dc7f6b
EB
7574 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7575 else
7576 unregister_netdevice_queue(dev, &dev_kill_list);
7577 }
7578 }
7579 unregister_netdevice_many(&dev_kill_list);
7580 rtnl_unlock();
7581}
7582
022cbae6 7583static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 7584 .exit = default_device_exit,
04dc7f6b 7585 .exit_batch = default_device_exit_batch,
ce286d32
EB
7586};
7587
1da177e4
LT
7588/*
7589 * Initialize the DEV module. At boot time this walks the device list and
7590 * unhooks any devices that fail to initialise (normally hardware not
7591 * present) and leaves us with a valid list of present and active devices.
7592 *
7593 */
7594
7595/*
7596 * This is called single threaded during boot, so no need
7597 * to take the rtnl semaphore.
7598 */
7599static int __init net_dev_init(void)
7600{
7601 int i, rc = -ENOMEM;
7602
7603 BUG_ON(!dev_boot_phase);
7604
1da177e4
LT
7605 if (dev_proc_init())
7606 goto out;
7607
8b41d188 7608 if (netdev_kobject_init())
1da177e4
LT
7609 goto out;
7610
7611 INIT_LIST_HEAD(&ptype_all);
82d8a867 7612 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
7613 INIT_LIST_HEAD(&ptype_base[i]);
7614
62532da9
VY
7615 INIT_LIST_HEAD(&offload_base);
7616
881d966b
EB
7617 if (register_pernet_subsys(&netdev_net_ops))
7618 goto out;
1da177e4
LT
7619
7620 /*
7621 * Initialise the packet receive queues.
7622 */
7623
6f912042 7624 for_each_possible_cpu(i) {
e36fa2f7 7625 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 7626
e36fa2f7 7627 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 7628 skb_queue_head_init(&sd->process_queue);
e36fa2f7 7629 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588 7630 sd->output_queue_tailp = &sd->output_queue;
df334545 7631#ifdef CONFIG_RPS
e36fa2f7
ED
7632 sd->csd.func = rps_trigger_softirq;
7633 sd->csd.info = sd;
e36fa2f7 7634 sd->cpu = i;
1e94d72f 7635#endif
0a9627f2 7636
e36fa2f7
ED
7637 sd->backlog.poll = process_backlog;
7638 sd->backlog.weight = weight_p;
1da177e4
LT
7639 }
7640
1da177e4
LT
7641 dev_boot_phase = 0;
7642
505d4f73
EB
7643 /* The loopback device is special: if any other network device
7644 * is present in a network namespace, the loopback device must
7645 * be present too. Since we now dynamically allocate and free the
7646 * loopback device, ensure this invariant is maintained by
7647 * keeping the loopback device as the first device on the
7648 * list of network devices, so that it is the first device
7649 * that appears and the last network device
7650 * that disappears.
7651 */
7652 if (register_pernet_device(&loopback_net_ops))
7653 goto out;
7654
7655 if (register_pernet_device(&default_device_ops))
7656 goto out;
7657
962cf36c
CM
7658 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7659 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
7660
7661 hotcpu_notifier(dev_cpu_callback, 0);
7662 dst_init();
1da177e4
LT
7663 rc = 0;
7664out:
7665 return rc;
7666}
7667
7668subsys_initcall(net_dev_init);