net: sweep-up some straglers in strlcpy conversion of .get_drvinfo routines
[deliverable/linux.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <asm/system.h>
77#include <linux/bitops.h>
4fc268d2 78#include <linux/capability.h>
1da177e4
LT
79#include <linux/cpu.h>
80#include <linux/types.h>
81#include <linux/kernel.h>
08e9897d 82#include <linux/hash.h>
5a0e3ad6 83#include <linux/slab.h>
1da177e4 84#include <linux/sched.h>
4a3e2f71 85#include <linux/mutex.h>
1da177e4
LT
86#include <linux/string.h>
87#include <linux/mm.h>
88#include <linux/socket.h>
89#include <linux/sockios.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/if_ether.h>
93#include <linux/netdevice.h>
94#include <linux/etherdevice.h>
0187bdfb 95#include <linux/ethtool.h>
1da177e4
LT
96#include <linux/notifier.h>
97#include <linux/skbuff.h>
457c4cbc 98#include <net/net_namespace.h>
1da177e4
LT
99#include <net/sock.h>
100#include <linux/rtnetlink.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/stat.h>
1da177e4
LT
104#include <net/dst.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
44540960 107#include <net/xfrm.h>
1da177e4
LT
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/kmod.h>
111#include <linux/module.h>
1da177e4
LT
112#include <linux/netpoll.h>
113#include <linux/rcupdate.h>
114#include <linux/delay.h>
295f4a1f 115#include <net/wext.h>
1da177e4 116#include <net/iw_handler.h>
1da177e4 117#include <asm/current.h>
5bdb9886 118#include <linux/audit.h>
db217334 119#include <linux/dmaengine.h>
f6a78bfc 120#include <linux/err.h>
c7fa9d18 121#include <linux/ctype.h>
723e98b7 122#include <linux/if_arp.h>
6de329e2 123#include <linux/if_vlan.h>
8f0f2223 124#include <linux/ip.h>
ad55dcaf 125#include <net/ip.h>
8f0f2223
DM
126#include <linux/ipv6.h>
127#include <linux/in.h>
b6b2fed1
DM
128#include <linux/jhash.h>
129#include <linux/random.h>
9cbc1cb8 130#include <trace/events/napi.h>
cf66ba58 131#include <trace/events/net.h>
07dc22e7 132#include <trace/events/skb.h>
5acbbd42 133#include <linux/pci.h>
caeda9b9 134#include <linux/inetdevice.h>
c445477d 135#include <linux/cpu_rmap.h>
e971b722 136#include <linux/if_tunnel.h>
ae1511bf 137#include <linux/if_pppox.h>
5dd17e08 138#include <linux/ppp_defs.h>
4dc360c5 139#include <linux/net_tstamp.h>
588f0330 140#include <linux/jump_label.h>
1da177e4 141
342709ef
PE
142#include "net-sysfs.h"
143
d565b0a1
HX
144/* Instead of increasing this, you should create a hash table. */
145#define MAX_GRO_SKBS 8
146
5d38a079
HX
147/* This should be increased if a protocol with a bigger head is added. */
148#define GRO_MAX_HEAD (MAX_HEADER + 128)
149
1da177e4
LT
150/*
151 * The list of packet types we will receive (as opposed to discard)
152 * and the routines to invoke.
153 *
154 * Why 16. Because with 16 the only overlap we get on a hash of the
155 * low nibble of the protocol value is RARP/SNAP/X.25.
156 *
157 * NOTE: That is no longer true with the addition of VLAN tags. Not
158 * sure which should go first, but I bet it won't make much
159 * difference if we are running VLANs. The good news is that
160 * this protocol won't be in the list unless compiled in, so
3041a069 161 * the average user (w/out VLANs) will not be adversely affected.
1da177e4
LT
162 * --BLG
163 *
164 * 0800 IP
165 * 8100 802.1Q VLAN
166 * 0001 802.3
167 * 0002 AX.25
168 * 0004 802.2
169 * 8035 RARP
170 * 0005 SNAP
171 * 0805 X.25
172 * 0806 ARP
173 * 8137 IPX
174 * 0009 Localtalk
175 * 86DD IPv6
176 */
177
82d8a867
PE
178#define PTYPE_HASH_SIZE (16)
179#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
180
1da177e4 181static DEFINE_SPINLOCK(ptype_lock);
82d8a867 182static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
6b2bedc3 183static struct list_head ptype_all __read_mostly; /* Taps */
1da177e4 184
1da177e4 185/*
7562f876 186 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
187 * semaphore.
188 *
c6d14c84 189 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
190 *
191 * Writers must hold the rtnl semaphore while they loop through the
7562f876 192 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
193 * actual updates. This allows pure readers to access the list even
194 * while a writer is preparing to update it.
195 *
196 * To put it another way, dev_base_lock is held for writing only to
197 * protect against pure readers; the rtnl semaphore provides the
198 * protection against other writers.
199 *
200 * See, for example usages, register_netdevice() and
201 * unregister_netdevice(), which must be called with the rtnl
202 * semaphore held.
203 */
1da177e4 204DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
205EXPORT_SYMBOL(dev_base_lock);
206
4e985ada
TG
207static inline void dev_base_seq_inc(struct net *net)
208{
209 while (++net->dev_base_seq == 0);
210}
211
881d966b 212static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4
LT
213{
214 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
08e9897d 215 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
216}
217
881d966b 218static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 219{
7c28bd0b 220 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
221}
222
e36fa2f7 223static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
224{
225#ifdef CONFIG_RPS
e36fa2f7 226 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
227#endif
228}
229
e36fa2f7 230static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
231{
232#ifdef CONFIG_RPS
e36fa2f7 233 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
234#endif
235}
236
ce286d32
EB
237/* Device list insertion */
238static int list_netdevice(struct net_device *dev)
239{
c346dca1 240 struct net *net = dev_net(dev);
ce286d32
EB
241
242 ASSERT_RTNL();
243
244 write_lock_bh(&dev_base_lock);
c6d14c84 245 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 246 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
247 hlist_add_head_rcu(&dev->index_hlist,
248 dev_index_hash(net, dev->ifindex));
ce286d32 249 write_unlock_bh(&dev_base_lock);
4e985ada
TG
250
251 dev_base_seq_inc(net);
252
ce286d32
EB
253 return 0;
254}
255
fb699dfd
ED
256/* Device list removal
257 * caller must respect a RCU grace period before freeing/reusing dev
258 */
ce286d32
EB
259static void unlist_netdevice(struct net_device *dev)
260{
261 ASSERT_RTNL();
262
263 /* Unlink dev from the device chain */
264 write_lock_bh(&dev_base_lock);
c6d14c84 265 list_del_rcu(&dev->dev_list);
72c9528b 266 hlist_del_rcu(&dev->name_hlist);
fb699dfd 267 hlist_del_rcu(&dev->index_hlist);
ce286d32 268 write_unlock_bh(&dev_base_lock);
4e985ada
TG
269
270 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
271}
272
1da177e4
LT
273/*
274 * Our notifier list
275 */
276
f07d5b94 277static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
278
279/*
280 * Device drivers call our routines to queue packets here. We empty the
281 * queue in the local softnet handler.
282 */
bea3348e 283
9958da05 284DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 285EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 286
cf508b12 287#ifdef CONFIG_LOCKDEP
723e98b7 288/*
c773e847 289 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
290 * according to dev->type
291 */
292static const unsigned short netdev_lock_type[] =
293 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
294 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
295 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
296 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
297 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
298 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
299 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
300 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
301 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
302 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
303 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
304 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
305 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
2d91d78b 306 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
929122cd 307 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
fcb94e42 308 ARPHRD_VOID, ARPHRD_NONE};
723e98b7 309
36cbd3dc 310static const char *const netdev_lock_name[] =
723e98b7
JP
311 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
312 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
313 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
314 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
315 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
316 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
317 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
318 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
319 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
320 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
321 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
322 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
323 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
2d91d78b 324 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
929122cd 325 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
fcb94e42 326 "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
327
328static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 329static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
330
331static inline unsigned short netdev_lock_pos(unsigned short dev_type)
332{
333 int i;
334
335 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
336 if (netdev_lock_type[i] == dev_type)
337 return i;
338 /* the last key is used by default */
339 return ARRAY_SIZE(netdev_lock_type) - 1;
340}
341
cf508b12
DM
342static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
343 unsigned short dev_type)
723e98b7
JP
344{
345 int i;
346
347 i = netdev_lock_pos(dev_type);
348 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
349 netdev_lock_name[i]);
350}
cf508b12
DM
351
352static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
353{
354 int i;
355
356 i = netdev_lock_pos(dev->type);
357 lockdep_set_class_and_name(&dev->addr_list_lock,
358 &netdev_addr_lock_key[i],
359 netdev_lock_name[i]);
360}
723e98b7 361#else
cf508b12
DM
362static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
363 unsigned short dev_type)
364{
365}
366static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
367{
368}
369#endif
1da177e4
LT
370
371/*******************************************************************************
372
373 Protocol management and registration routines
374
375*******************************************************************************/
376
1da177e4
LT
377/*
378 * Add a protocol ID to the list. Now that the input handler is
379 * smarter we can dispense with all the messy stuff that used to be
380 * here.
381 *
382 * BEWARE!!! Protocol handlers, mangling input packets,
383 * MUST BE last in hash buckets and checking protocol handlers
384 * MUST start from promiscuous ptype_all chain in net_bh.
385 * It is true now, do not change it.
386 * Explanation follows: if protocol handler, mangling packet, will
387 * be the first on list, it is not able to sense, that packet
388 * is cloned and should be copied-on-write, so that it will
389 * change it and subsequent readers will get broken packet.
390 * --ANK (980803)
391 */
392
c07b68e8
ED
393static inline struct list_head *ptype_head(const struct packet_type *pt)
394{
395 if (pt->type == htons(ETH_P_ALL))
396 return &ptype_all;
397 else
398 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
399}
400
1da177e4
LT
401/**
402 * dev_add_pack - add packet handler
403 * @pt: packet type declaration
404 *
405 * Add a protocol handler to the networking stack. The passed &packet_type
406 * is linked into kernel lists and may not be freed until it has been
407 * removed from the kernel lists.
408 *
4ec93edb 409 * This call does not sleep therefore it can not
1da177e4
LT
410 * guarantee all CPU's that are in middle of receiving packets
411 * will see the new packet type (until the next received packet).
412 */
413
414void dev_add_pack(struct packet_type *pt)
415{
c07b68e8 416 struct list_head *head = ptype_head(pt);
1da177e4 417
c07b68e8
ED
418 spin_lock(&ptype_lock);
419 list_add_rcu(&pt->list, head);
420 spin_unlock(&ptype_lock);
1da177e4 421}
d1b19dff 422EXPORT_SYMBOL(dev_add_pack);
1da177e4 423
1da177e4
LT
424/**
425 * __dev_remove_pack - remove packet handler
426 * @pt: packet type declaration
427 *
428 * Remove a protocol handler that was previously added to the kernel
429 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
430 * from the kernel lists and can be freed or reused once this function
4ec93edb 431 * returns.
1da177e4
LT
432 *
433 * The packet type might still be in use by receivers
434 * and must not be freed until after all the CPU's have gone
435 * through a quiescent state.
436 */
437void __dev_remove_pack(struct packet_type *pt)
438{
c07b68e8 439 struct list_head *head = ptype_head(pt);
1da177e4
LT
440 struct packet_type *pt1;
441
c07b68e8 442 spin_lock(&ptype_lock);
1da177e4
LT
443
444 list_for_each_entry(pt1, head, list) {
445 if (pt == pt1) {
446 list_del_rcu(&pt->list);
447 goto out;
448 }
449 }
450
451 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
452out:
c07b68e8 453 spin_unlock(&ptype_lock);
1da177e4 454}
d1b19dff
ED
455EXPORT_SYMBOL(__dev_remove_pack);
456
1da177e4
LT
457/**
458 * dev_remove_pack - remove packet handler
459 * @pt: packet type declaration
460 *
461 * Remove a protocol handler that was previously added to the kernel
462 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
463 * from the kernel lists and can be freed or reused once this function
464 * returns.
465 *
466 * This call sleeps to guarantee that no CPU is looking at the packet
467 * type after return.
468 */
469void dev_remove_pack(struct packet_type *pt)
470{
471 __dev_remove_pack(pt);
4ec93edb 472
1da177e4
LT
473 synchronize_net();
474}
d1b19dff 475EXPORT_SYMBOL(dev_remove_pack);
1da177e4
LT
476
477/******************************************************************************
478
479 Device Boot-time Settings Routines
480
481*******************************************************************************/
482
483/* Boot time configuration table */
484static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
485
486/**
487 * netdev_boot_setup_add - add new setup entry
488 * @name: name of the device
489 * @map: configured settings for the device
490 *
491 * Adds new setup entry to the dev_boot_setup list. The function
492 * returns 0 on error and 1 on success. This is a generic routine to
493 * all netdevices.
494 */
495static int netdev_boot_setup_add(char *name, struct ifmap *map)
496{
497 struct netdev_boot_setup *s;
498 int i;
499
500 s = dev_boot_setup;
501 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
502 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
503 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 504 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
505 memcpy(&s[i].map, map, sizeof(s[i].map));
506 break;
507 }
508 }
509
510 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
511}
512
513/**
514 * netdev_boot_setup_check - check boot time settings
515 * @dev: the netdevice
516 *
517 * Check boot time settings for the device.
518 * The found settings are set for the device to be used
519 * later in the device probing.
520 * Returns 0 if no settings found, 1 if they are.
521 */
522int netdev_boot_setup_check(struct net_device *dev)
523{
524 struct netdev_boot_setup *s = dev_boot_setup;
525 int i;
526
527 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
528 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 529 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
530 dev->irq = s[i].map.irq;
531 dev->base_addr = s[i].map.base_addr;
532 dev->mem_start = s[i].map.mem_start;
533 dev->mem_end = s[i].map.mem_end;
534 return 1;
535 }
536 }
537 return 0;
538}
d1b19dff 539EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
540
541
542/**
543 * netdev_boot_base - get address from boot time settings
544 * @prefix: prefix for network device
545 * @unit: id for network device
546 *
547 * Check boot time settings for the base address of device.
548 * The found settings are set for the device to be used
549 * later in the device probing.
550 * Returns 0 if no settings found.
551 */
552unsigned long netdev_boot_base(const char *prefix, int unit)
553{
554 const struct netdev_boot_setup *s = dev_boot_setup;
555 char name[IFNAMSIZ];
556 int i;
557
558 sprintf(name, "%s%d", prefix, unit);
559
560 /*
561 * If device already registered then return base of 1
562 * to indicate not to probe for this interface
563 */
881d966b 564 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
565 return 1;
566
567 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
568 if (!strcmp(name, s[i].name))
569 return s[i].map.base_addr;
570 return 0;
571}
572
573/*
574 * Saves at boot time configured settings for any netdevice.
575 */
576int __init netdev_boot_setup(char *str)
577{
578 int ints[5];
579 struct ifmap map;
580
581 str = get_options(str, ARRAY_SIZE(ints), ints);
582 if (!str || !*str)
583 return 0;
584
585 /* Save settings */
586 memset(&map, 0, sizeof(map));
587 if (ints[0] > 0)
588 map.irq = ints[1];
589 if (ints[0] > 1)
590 map.base_addr = ints[2];
591 if (ints[0] > 2)
592 map.mem_start = ints[3];
593 if (ints[0] > 3)
594 map.mem_end = ints[4];
595
596 /* Add new entry to the list */
597 return netdev_boot_setup_add(str, &map);
598}
599
600__setup("netdev=", netdev_boot_setup);
601
602/*******************************************************************************
603
604 Device Interface Subroutines
605
606*******************************************************************************/
607
608/**
609 * __dev_get_by_name - find a device by its name
c4ea43c5 610 * @net: the applicable net namespace
1da177e4
LT
611 * @name: name to find
612 *
613 * Find an interface by name. Must be called under RTNL semaphore
614 * or @dev_base_lock. If the name is found a pointer to the device
615 * is returned. If the name is not found then %NULL is returned. The
616 * reference counters are not incremented so the caller must be
617 * careful with locks.
618 */
619
881d966b 620struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
621{
622 struct hlist_node *p;
0bd8d536
ED
623 struct net_device *dev;
624 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 625
0bd8d536 626 hlist_for_each_entry(dev, p, head, name_hlist)
1da177e4
LT
627 if (!strncmp(dev->name, name, IFNAMSIZ))
628 return dev;
0bd8d536 629
1da177e4
LT
630 return NULL;
631}
d1b19dff 632EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 633
72c9528b
ED
634/**
635 * dev_get_by_name_rcu - find a device by its name
636 * @net: the applicable net namespace
637 * @name: name to find
638 *
639 * Find an interface by name.
640 * If the name is found a pointer to the device is returned.
641 * If the name is not found then %NULL is returned.
642 * The reference counters are not incremented so the caller must be
643 * careful with locks. The caller must hold RCU lock.
644 */
645
646struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
647{
648 struct hlist_node *p;
649 struct net_device *dev;
650 struct hlist_head *head = dev_name_hash(net, name);
651
652 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
653 if (!strncmp(dev->name, name, IFNAMSIZ))
654 return dev;
655
656 return NULL;
657}
658EXPORT_SYMBOL(dev_get_by_name_rcu);
659
1da177e4
LT
660/**
661 * dev_get_by_name - find a device by its name
c4ea43c5 662 * @net: the applicable net namespace
1da177e4
LT
663 * @name: name to find
664 *
665 * Find an interface by name. This can be called from any
666 * context and does its own locking. The returned handle has
667 * the usage count incremented and the caller must use dev_put() to
668 * release it when it is no longer needed. %NULL is returned if no
669 * matching device is found.
670 */
671
881d966b 672struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
673{
674 struct net_device *dev;
675
72c9528b
ED
676 rcu_read_lock();
677 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
678 if (dev)
679 dev_hold(dev);
72c9528b 680 rcu_read_unlock();
1da177e4
LT
681 return dev;
682}
d1b19dff 683EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
684
685/**
686 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 687 * @net: the applicable net namespace
1da177e4
LT
688 * @ifindex: index of device
689 *
690 * Search for an interface by index. Returns %NULL if the device
691 * is not found or a pointer to the device. The device has not
692 * had its reference counter increased so the caller must be careful
693 * about locking. The caller must hold either the RTNL semaphore
694 * or @dev_base_lock.
695 */
696
881d966b 697struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
698{
699 struct hlist_node *p;
0bd8d536
ED
700 struct net_device *dev;
701 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 702
0bd8d536 703 hlist_for_each_entry(dev, p, head, index_hlist)
1da177e4
LT
704 if (dev->ifindex == ifindex)
705 return dev;
0bd8d536 706
1da177e4
LT
707 return NULL;
708}
d1b19dff 709EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 710
fb699dfd
ED
711/**
712 * dev_get_by_index_rcu - find a device by its ifindex
713 * @net: the applicable net namespace
714 * @ifindex: index of device
715 *
716 * Search for an interface by index. Returns %NULL if the device
717 * is not found or a pointer to the device. The device has not
718 * had its reference counter increased so the caller must be careful
719 * about locking. The caller must hold RCU lock.
720 */
721
722struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
723{
724 struct hlist_node *p;
725 struct net_device *dev;
726 struct hlist_head *head = dev_index_hash(net, ifindex);
727
728 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
729 if (dev->ifindex == ifindex)
730 return dev;
731
732 return NULL;
733}
734EXPORT_SYMBOL(dev_get_by_index_rcu);
735
1da177e4
LT
736
737/**
738 * dev_get_by_index - find a device by its ifindex
c4ea43c5 739 * @net: the applicable net namespace
1da177e4
LT
740 * @ifindex: index of device
741 *
742 * Search for an interface by index. Returns NULL if the device
743 * is not found or a pointer to the device. The device returned has
744 * had a reference added and the pointer is safe until the user calls
745 * dev_put to indicate they have finished with it.
746 */
747
881d966b 748struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
749{
750 struct net_device *dev;
751
fb699dfd
ED
752 rcu_read_lock();
753 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
754 if (dev)
755 dev_hold(dev);
fb699dfd 756 rcu_read_unlock();
1da177e4
LT
757 return dev;
758}
d1b19dff 759EXPORT_SYMBOL(dev_get_by_index);
1da177e4
LT
760
761/**
941666c2 762 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 763 * @net: the applicable net namespace
1da177e4
LT
764 * @type: media type of device
765 * @ha: hardware address
766 *
767 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
768 * is not found or a pointer to the device.
769 * The caller must hold RCU or RTNL.
941666c2 770 * The returned device has not had its ref count increased
1da177e4
LT
771 * and the caller must therefore be careful about locking
772 *
1da177e4
LT
773 */
774
941666c2
ED
775struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
776 const char *ha)
1da177e4
LT
777{
778 struct net_device *dev;
779
941666c2 780 for_each_netdev_rcu(net, dev)
1da177e4
LT
781 if (dev->type == type &&
782 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
783 return dev;
784
785 return NULL;
1da177e4 786}
941666c2 787EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 788
881d966b 789struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
790{
791 struct net_device *dev;
792
4e9cac2b 793 ASSERT_RTNL();
881d966b 794 for_each_netdev(net, dev)
4e9cac2b 795 if (dev->type == type)
7562f876
PE
796 return dev;
797
798 return NULL;
4e9cac2b 799}
4e9cac2b
PM
800EXPORT_SYMBOL(__dev_getfirstbyhwtype);
801
881d966b 802struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 803{
99fe3c39 804 struct net_device *dev, *ret = NULL;
4e9cac2b 805
99fe3c39
ED
806 rcu_read_lock();
807 for_each_netdev_rcu(net, dev)
808 if (dev->type == type) {
809 dev_hold(dev);
810 ret = dev;
811 break;
812 }
813 rcu_read_unlock();
814 return ret;
1da177e4 815}
1da177e4
LT
816EXPORT_SYMBOL(dev_getfirstbyhwtype);
817
818/**
bb69ae04 819 * dev_get_by_flags_rcu - find any device with given flags
c4ea43c5 820 * @net: the applicable net namespace
1da177e4
LT
821 * @if_flags: IFF_* values
822 * @mask: bitmask of bits in if_flags to check
823 *
824 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04
ED
825 * is not found or a pointer to the device. Must be called inside
826 * rcu_read_lock(), and result refcount is unchanged.
1da177e4
LT
827 */
828
bb69ae04 829struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
d1b19dff 830 unsigned short mask)
1da177e4 831{
7562f876 832 struct net_device *dev, *ret;
1da177e4 833
7562f876 834 ret = NULL;
c6d14c84 835 for_each_netdev_rcu(net, dev) {
1da177e4 836 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 837 ret = dev;
1da177e4
LT
838 break;
839 }
840 }
7562f876 841 return ret;
1da177e4 842}
bb69ae04 843EXPORT_SYMBOL(dev_get_by_flags_rcu);
1da177e4
LT
844
845/**
846 * dev_valid_name - check if name is okay for network device
847 * @name: name string
848 *
849 * Network device names need to be valid file names to
c7fa9d18
DM
850 * to allow sysfs to work. We also disallow any kind of
851 * whitespace.
1da177e4 852 */
c2373ee9 853int dev_valid_name(const char *name)
1da177e4 854{
c7fa9d18
DM
855 if (*name == '\0')
856 return 0;
b6fe17d6
SH
857 if (strlen(name) >= IFNAMSIZ)
858 return 0;
c7fa9d18
DM
859 if (!strcmp(name, ".") || !strcmp(name, ".."))
860 return 0;
861
862 while (*name) {
863 if (*name == '/' || isspace(*name))
864 return 0;
865 name++;
866 }
867 return 1;
1da177e4 868}
d1b19dff 869EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
870
871/**
b267b179
EB
872 * __dev_alloc_name - allocate a name for a device
873 * @net: network namespace to allocate the device name in
1da177e4 874 * @name: name format string
b267b179 875 * @buf: scratch buffer and result name string
1da177e4
LT
876 *
877 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
878 * id. It scans list of devices to build up a free map, then chooses
879 * the first empty slot. The caller must hold the dev_base or rtnl lock
880 * while allocating the name and adding the device in order to avoid
881 * duplicates.
882 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
883 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
884 */
885
b267b179 886static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
887{
888 int i = 0;
1da177e4
LT
889 const char *p;
890 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 891 unsigned long *inuse;
1da177e4
LT
892 struct net_device *d;
893
894 p = strnchr(name, IFNAMSIZ-1, '%');
895 if (p) {
896 /*
897 * Verify the string as this thing may have come from
898 * the user. There must be either one "%d" and no other "%"
899 * characters.
900 */
901 if (p[1] != 'd' || strchr(p + 2, '%'))
902 return -EINVAL;
903
904 /* Use one page as a bit array of possible slots */
cfcabdcc 905 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
906 if (!inuse)
907 return -ENOMEM;
908
881d966b 909 for_each_netdev(net, d) {
1da177e4
LT
910 if (!sscanf(d->name, name, &i))
911 continue;
912 if (i < 0 || i >= max_netdevices)
913 continue;
914
915 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 916 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
917 if (!strncmp(buf, d->name, IFNAMSIZ))
918 set_bit(i, inuse);
919 }
920
921 i = find_first_zero_bit(inuse, max_netdevices);
922 free_page((unsigned long) inuse);
923 }
924
d9031024
OP
925 if (buf != name)
926 snprintf(buf, IFNAMSIZ, name, i);
b267b179 927 if (!__dev_get_by_name(net, buf))
1da177e4 928 return i;
1da177e4
LT
929
930 /* It is possible to run out of possible slots
931 * when the name is long and there isn't enough space left
932 * for the digits, or if all bits are used.
933 */
934 return -ENFILE;
935}
936
b267b179
EB
937/**
938 * dev_alloc_name - allocate a name for a device
939 * @dev: device
940 * @name: name format string
941 *
942 * Passed a format string - eg "lt%d" it will try and find a suitable
943 * id. It scans list of devices to build up a free map, then chooses
944 * the first empty slot. The caller must hold the dev_base or rtnl lock
945 * while allocating the name and adding the device in order to avoid
946 * duplicates.
947 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
948 * Returns the number of the unit assigned or a negative errno code.
949 */
950
951int dev_alloc_name(struct net_device *dev, const char *name)
952{
953 char buf[IFNAMSIZ];
954 struct net *net;
955 int ret;
956
c346dca1
YH
957 BUG_ON(!dev_net(dev));
958 net = dev_net(dev);
b267b179
EB
959 ret = __dev_alloc_name(net, name, buf);
960 if (ret >= 0)
961 strlcpy(dev->name, buf, IFNAMSIZ);
962 return ret;
963}
d1b19dff 964EXPORT_SYMBOL(dev_alloc_name);
b267b179 965
1c5cae81 966static int dev_get_valid_name(struct net_device *dev, const char *name)
d9031024 967{
8ce6cebc
DL
968 struct net *net;
969
970 BUG_ON(!dev_net(dev));
971 net = dev_net(dev);
972
d9031024
OP
973 if (!dev_valid_name(name))
974 return -EINVAL;
975
1c5cae81 976 if (strchr(name, '%'))
8ce6cebc 977 return dev_alloc_name(dev, name);
d9031024
OP
978 else if (__dev_get_by_name(net, name))
979 return -EEXIST;
8ce6cebc
DL
980 else if (dev->name != name)
981 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
982
983 return 0;
984}
1da177e4
LT
985
986/**
987 * dev_change_name - change name of a device
988 * @dev: device
989 * @newname: name (or format string) must be at least IFNAMSIZ
990 *
991 * Change name of a device, can pass format strings "eth%d".
992 * for wildcarding.
993 */
cf04a4c7 994int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 995{
fcc5a03a 996 char oldname[IFNAMSIZ];
1da177e4 997 int err = 0;
fcc5a03a 998 int ret;
881d966b 999 struct net *net;
1da177e4
LT
1000
1001 ASSERT_RTNL();
c346dca1 1002 BUG_ON(!dev_net(dev));
1da177e4 1003
c346dca1 1004 net = dev_net(dev);
1da177e4
LT
1005 if (dev->flags & IFF_UP)
1006 return -EBUSY;
1007
c8d90dca
SH
1008 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1009 return 0;
1010
fcc5a03a
HX
1011 memcpy(oldname, dev->name, IFNAMSIZ);
1012
1c5cae81 1013 err = dev_get_valid_name(dev, newname);
d9031024
OP
1014 if (err < 0)
1015 return err;
1da177e4 1016
fcc5a03a 1017rollback:
a1b3f594
EB
1018 ret = device_rename(&dev->dev, dev->name);
1019 if (ret) {
1020 memcpy(dev->name, oldname, IFNAMSIZ);
1021 return ret;
dcc99773 1022 }
7f988eab
HX
1023
1024 write_lock_bh(&dev_base_lock);
372b2312 1025 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1026 write_unlock_bh(&dev_base_lock);
1027
1028 synchronize_rcu();
1029
1030 write_lock_bh(&dev_base_lock);
1031 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1032 write_unlock_bh(&dev_base_lock);
1033
056925ab 1034 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1035 ret = notifier_to_errno(ret);
1036
1037 if (ret) {
91e9c07b
ED
1038 /* err >= 0 after dev_alloc_name() or stores the first errno */
1039 if (err >= 0) {
fcc5a03a
HX
1040 err = ret;
1041 memcpy(dev->name, oldname, IFNAMSIZ);
1042 goto rollback;
91e9c07b
ED
1043 } else {
1044 printk(KERN_ERR
1045 "%s: name change rollback failed: %d.\n",
1046 dev->name, ret);
fcc5a03a
HX
1047 }
1048 }
1da177e4
LT
1049
1050 return err;
1051}
1052
0b815a1a
SH
1053/**
1054 * dev_set_alias - change ifalias of a device
1055 * @dev: device
1056 * @alias: name up to IFALIASZ
f0db275a 1057 * @len: limit of bytes to copy from info
0b815a1a
SH
1058 *
1059 * Set ifalias for a device,
1060 */
1061int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1062{
1063 ASSERT_RTNL();
1064
1065 if (len >= IFALIASZ)
1066 return -EINVAL;
1067
96ca4a2c
OH
1068 if (!len) {
1069 if (dev->ifalias) {
1070 kfree(dev->ifalias);
1071 dev->ifalias = NULL;
1072 }
1073 return 0;
1074 }
1075
d1b19dff 1076 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
0b815a1a
SH
1077 if (!dev->ifalias)
1078 return -ENOMEM;
1079
1080 strlcpy(dev->ifalias, alias, len+1);
1081 return len;
1082}
1083
1084
d8a33ac4 1085/**
3041a069 1086 * netdev_features_change - device changes features
d8a33ac4
SH
1087 * @dev: device to cause notification
1088 *
1089 * Called to indicate a device has changed features.
1090 */
1091void netdev_features_change(struct net_device *dev)
1092{
056925ab 1093 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1094}
1095EXPORT_SYMBOL(netdev_features_change);
1096
1da177e4
LT
1097/**
1098 * netdev_state_change - device changes state
1099 * @dev: device to cause notification
1100 *
1101 * Called to indicate a device has changed state. This function calls
1102 * the notifier chains for netdev_chain and sends a NEWLINK message
1103 * to the routing socket.
1104 */
1105void netdev_state_change(struct net_device *dev)
1106{
1107 if (dev->flags & IFF_UP) {
056925ab 1108 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1109 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1110 }
1111}
d1b19dff 1112EXPORT_SYMBOL(netdev_state_change);
1da177e4 1113
3ca5b404 1114int netdev_bonding_change(struct net_device *dev, unsigned long event)
c1da4ac7 1115{
3ca5b404 1116 return call_netdevice_notifiers(event, dev);
c1da4ac7
OG
1117}
1118EXPORT_SYMBOL(netdev_bonding_change);
1119
1da177e4
LT
1120/**
1121 * dev_load - load a network module
c4ea43c5 1122 * @net: the applicable net namespace
1da177e4
LT
1123 * @name: name of interface
1124 *
1125 * If a network interface is not present and the process has suitable
1126 * privileges this function loads the module. If module loading is not
1127 * available in this kernel then it becomes a nop.
1128 */
1129
881d966b 1130void dev_load(struct net *net, const char *name)
1da177e4 1131{
4ec93edb 1132 struct net_device *dev;
8909c9ad 1133 int no_module;
1da177e4 1134
72c9528b
ED
1135 rcu_read_lock();
1136 dev = dev_get_by_name_rcu(net, name);
1137 rcu_read_unlock();
1da177e4 1138
8909c9ad
VK
1139 no_module = !dev;
1140 if (no_module && capable(CAP_NET_ADMIN))
1141 no_module = request_module("netdev-%s", name);
1142 if (no_module && capable(CAP_SYS_MODULE)) {
1143 if (!request_module("%s", name))
1144 pr_err("Loading kernel module for a network device "
1145"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1146"instead\n", name);
1147 }
1da177e4 1148}
d1b19dff 1149EXPORT_SYMBOL(dev_load);
1da177e4 1150
bd380811 1151static int __dev_open(struct net_device *dev)
1da177e4 1152{
d314774c 1153 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1154 int ret;
1da177e4 1155
e46b66bc
BH
1156 ASSERT_RTNL();
1157
1da177e4
LT
1158 if (!netif_device_present(dev))
1159 return -ENODEV;
1160
3b8bcfd5
JB
1161 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1162 ret = notifier_to_errno(ret);
1163 if (ret)
1164 return ret;
1165
1da177e4 1166 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1167
d314774c
SH
1168 if (ops->ndo_validate_addr)
1169 ret = ops->ndo_validate_addr(dev);
bada339b 1170
d314774c
SH
1171 if (!ret && ops->ndo_open)
1172 ret = ops->ndo_open(dev);
1da177e4 1173
bada339b
JG
1174 if (ret)
1175 clear_bit(__LINK_STATE_START, &dev->state);
1176 else {
1da177e4 1177 dev->flags |= IFF_UP;
b4bd07c2 1178 net_dmaengine_get();
4417da66 1179 dev_set_rx_mode(dev);
1da177e4 1180 dev_activate(dev);
1da177e4 1181 }
bada339b 1182
1da177e4
LT
1183 return ret;
1184}
1185
1186/**
bd380811
PM
1187 * dev_open - prepare an interface for use.
1188 * @dev: device to open
1da177e4 1189 *
bd380811
PM
1190 * Takes a device from down to up state. The device's private open
1191 * function is invoked and then the multicast lists are loaded. Finally
1192 * the device is moved into the up state and a %NETDEV_UP message is
1193 * sent to the netdev notifier chain.
1194 *
1195 * Calling this function on an active interface is a nop. On a failure
1196 * a negative errno code is returned.
1da177e4 1197 */
bd380811
PM
1198int dev_open(struct net_device *dev)
1199{
1200 int ret;
1201
bd380811
PM
1202 if (dev->flags & IFF_UP)
1203 return 0;
1204
bd380811
PM
1205 ret = __dev_open(dev);
1206 if (ret < 0)
1207 return ret;
1208
bd380811
PM
1209 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210 call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212 return ret;
1213}
1214EXPORT_SYMBOL(dev_open);
1215
44345724 1216static int __dev_close_many(struct list_head *head)
1da177e4 1217{
44345724 1218 struct net_device *dev;
e46b66bc 1219
bd380811 1220 ASSERT_RTNL();
9d5010db
DM
1221 might_sleep();
1222
44345724 1223 list_for_each_entry(dev, head, unreg_list) {
44345724 1224 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1225
44345724 1226 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1227
44345724
OP
1228 /* Synchronize to scheduled poll. We cannot touch poll list, it
1229 * can be even on different cpu. So just clear netif_running().
1230 *
1231 * dev->stop() will invoke napi_disable() on all of it's
1232 * napi_struct instances on this device.
1233 */
1234 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1235 }
1da177e4 1236
44345724 1237 dev_deactivate_many(head);
d8b2a4d2 1238
44345724
OP
1239 list_for_each_entry(dev, head, unreg_list) {
1240 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1241
44345724
OP
1242 /*
1243 * Call the device specific close. This cannot fail.
1244 * Only if device is UP
1245 *
1246 * We allow it to be called even after a DETACH hot-plug
1247 * event.
1248 */
1249 if (ops->ndo_stop)
1250 ops->ndo_stop(dev);
1251
44345724 1252 dev->flags &= ~IFF_UP;
44345724
OP
1253 net_dmaengine_put();
1254 }
1255
1256 return 0;
1257}
1258
1259static int __dev_close(struct net_device *dev)
1260{
f87e6f47 1261 int retval;
44345724
OP
1262 LIST_HEAD(single);
1263
1264 list_add(&dev->unreg_list, &single);
f87e6f47
LT
1265 retval = __dev_close_many(&single);
1266 list_del(&single);
1267 return retval;
44345724
OP
1268}
1269
3fbd8758 1270static int dev_close_many(struct list_head *head)
44345724
OP
1271{
1272 struct net_device *dev, *tmp;
1273 LIST_HEAD(tmp_list);
1da177e4 1274
44345724
OP
1275 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1276 if (!(dev->flags & IFF_UP))
1277 list_move(&dev->unreg_list, &tmp_list);
1278
1279 __dev_close_many(head);
1da177e4 1280
44345724
OP
1281 list_for_each_entry(dev, head, unreg_list) {
1282 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1283 call_netdevice_notifiers(NETDEV_DOWN, dev);
1284 }
bd380811 1285
44345724
OP
1286 /* rollback_registered_many needs the complete original list */
1287 list_splice(&tmp_list, head);
bd380811
PM
1288 return 0;
1289}
1290
1291/**
1292 * dev_close - shutdown an interface.
1293 * @dev: device to shutdown
1294 *
1295 * This function moves an active device into down state. A
1296 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1297 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1298 * chain.
1299 */
1300int dev_close(struct net_device *dev)
1301{
e14a5993
ED
1302 if (dev->flags & IFF_UP) {
1303 LIST_HEAD(single);
1da177e4 1304
e14a5993
ED
1305 list_add(&dev->unreg_list, &single);
1306 dev_close_many(&single);
1307 list_del(&single);
1308 }
1da177e4
LT
1309 return 0;
1310}
d1b19dff 1311EXPORT_SYMBOL(dev_close);
1da177e4
LT
1312
1313
0187bdfb
BH
1314/**
1315 * dev_disable_lro - disable Large Receive Offload on a device
1316 * @dev: device
1317 *
1318 * Disable Large Receive Offload (LRO) on a net device. Must be
1319 * called under RTNL. This is needed if received packets may be
1320 * forwarded to another interface.
1321 */
1322void dev_disable_lro(struct net_device *dev)
1323{
27660515
MM
1324 u32 flags;
1325
f11970e3
NH
1326 /*
1327 * If we're trying to disable lro on a vlan device
1328 * use the underlying physical device instead
1329 */
1330 if (is_vlan_dev(dev))
1331 dev = vlan_dev_real_dev(dev);
1332
27660515
MM
1333 if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1334 flags = dev->ethtool_ops->get_flags(dev);
1335 else
1336 flags = ethtool_op_get_flags(dev);
1337
1338 if (!(flags & ETH_FLAG_LRO))
1339 return;
1340
1341 __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
22d5969f
MM
1342 if (unlikely(dev->features & NETIF_F_LRO))
1343 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1344}
1345EXPORT_SYMBOL(dev_disable_lro);
1346
1347
881d966b
EB
1348static int dev_boot_phase = 1;
1349
1da177e4
LT
1350/**
1351 * register_netdevice_notifier - register a network notifier block
1352 * @nb: notifier
1353 *
1354 * Register a notifier to be called when network device events occur.
1355 * The notifier passed is linked into the kernel structures and must
1356 * not be reused until it has been unregistered. A negative errno code
1357 * is returned on a failure.
1358 *
1359 * When registered all registration and up events are replayed
4ec93edb 1360 * to the new notifier to allow device to have a race free
1da177e4
LT
1361 * view of the network device list.
1362 */
1363
1364int register_netdevice_notifier(struct notifier_block *nb)
1365{
1366 struct net_device *dev;
fcc5a03a 1367 struct net_device *last;
881d966b 1368 struct net *net;
1da177e4
LT
1369 int err;
1370
1371 rtnl_lock();
f07d5b94 1372 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1373 if (err)
1374 goto unlock;
881d966b
EB
1375 if (dev_boot_phase)
1376 goto unlock;
1377 for_each_net(net) {
1378 for_each_netdev(net, dev) {
1379 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1380 err = notifier_to_errno(err);
1381 if (err)
1382 goto rollback;
1383
1384 if (!(dev->flags & IFF_UP))
1385 continue;
1da177e4 1386
881d966b
EB
1387 nb->notifier_call(nb, NETDEV_UP, dev);
1388 }
1da177e4 1389 }
fcc5a03a
HX
1390
1391unlock:
1da177e4
LT
1392 rtnl_unlock();
1393 return err;
fcc5a03a
HX
1394
1395rollback:
1396 last = dev;
881d966b
EB
1397 for_each_net(net) {
1398 for_each_netdev(net, dev) {
1399 if (dev == last)
1400 break;
fcc5a03a 1401
881d966b
EB
1402 if (dev->flags & IFF_UP) {
1403 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1404 nb->notifier_call(nb, NETDEV_DOWN, dev);
1405 }
1406 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
a5ee1551 1407 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
fcc5a03a 1408 }
fcc5a03a 1409 }
c67625a1
PE
1410
1411 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1412 goto unlock;
1da177e4 1413}
d1b19dff 1414EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1415
1416/**
1417 * unregister_netdevice_notifier - unregister a network notifier block
1418 * @nb: notifier
1419 *
1420 * Unregister a notifier previously registered by
1421 * register_netdevice_notifier(). The notifier is unlinked into the
1422 * kernel structures and may then be reused. A negative errno code
1423 * is returned on a failure.
1424 */
1425
1426int unregister_netdevice_notifier(struct notifier_block *nb)
1427{
9f514950
HX
1428 int err;
1429
1430 rtnl_lock();
f07d5b94 1431 err = raw_notifier_chain_unregister(&netdev_chain, nb);
9f514950
HX
1432 rtnl_unlock();
1433 return err;
1da177e4 1434}
d1b19dff 1435EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4
LT
1436
1437/**
1438 * call_netdevice_notifiers - call all network notifier blocks
1439 * @val: value passed unmodified to notifier function
c4ea43c5 1440 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1441 *
1442 * Call all network notifier blocks. Parameters and return value
f07d5b94 1443 * are as for raw_notifier_call_chain().
1da177e4
LT
1444 */
1445
ad7379d4 1446int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1447{
ab930471 1448 ASSERT_RTNL();
ad7379d4 1449 return raw_notifier_call_chain(&netdev_chain, val, dev);
1da177e4 1450}
edf947f1 1451EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1452
588f0330 1453static struct jump_label_key netstamp_needed __read_mostly;
1da177e4
LT
1454
1455void net_enable_timestamp(void)
1456{
588f0330 1457 jump_label_inc(&netstamp_needed);
1da177e4 1458}
d1b19dff 1459EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1460
1461void net_disable_timestamp(void)
1462{
588f0330 1463 jump_label_dec(&netstamp_needed);
1da177e4 1464}
d1b19dff 1465EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1466
3b098e2d 1467static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1468{
588f0330
ED
1469 skb->tstamp.tv64 = 0;
1470 if (static_branch(&netstamp_needed))
a61bbcf2 1471 __net_timestamp(skb);
1da177e4
LT
1472}
1473
588f0330
ED
1474#define net_timestamp_check(COND, SKB) \
1475 if (static_branch(&netstamp_needed)) { \
1476 if ((COND) && !(SKB)->tstamp.tv64) \
1477 __net_timestamp(SKB); \
1478 } \
3b098e2d 1479
4dc360c5
RC
1480static int net_hwtstamp_validate(struct ifreq *ifr)
1481{
1482 struct hwtstamp_config cfg;
1483 enum hwtstamp_tx_types tx_type;
1484 enum hwtstamp_rx_filters rx_filter;
1485 int tx_type_valid = 0;
1486 int rx_filter_valid = 0;
1487
1488 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1489 return -EFAULT;
1490
1491 if (cfg.flags) /* reserved for future extensions */
1492 return -EINVAL;
1493
1494 tx_type = cfg.tx_type;
1495 rx_filter = cfg.rx_filter;
1496
1497 switch (tx_type) {
1498 case HWTSTAMP_TX_OFF:
1499 case HWTSTAMP_TX_ON:
1500 case HWTSTAMP_TX_ONESTEP_SYNC:
1501 tx_type_valid = 1;
1502 break;
1503 }
1504
1505 switch (rx_filter) {
1506 case HWTSTAMP_FILTER_NONE:
1507 case HWTSTAMP_FILTER_ALL:
1508 case HWTSTAMP_FILTER_SOME:
1509 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1510 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1511 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1512 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1513 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1514 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1515 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1516 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1517 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1518 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1519 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1520 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1521 rx_filter_valid = 1;
1522 break;
1523 }
1524
1525 if (!tx_type_valid || !rx_filter_valid)
1526 return -ERANGE;
1527
1528 return 0;
1529}
1530
79b569f0
DL
1531static inline bool is_skb_forwardable(struct net_device *dev,
1532 struct sk_buff *skb)
1533{
1534 unsigned int len;
1535
1536 if (!(dev->flags & IFF_UP))
1537 return false;
1538
1539 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1540 if (skb->len <= len)
1541 return true;
1542
1543 /* if TSO is enabled, we don't care about the length as the packet
1544 * could be forwarded without being segmented before
1545 */
1546 if (skb_is_gso(skb))
1547 return true;
1548
1549 return false;
1550}
1551
44540960
AB
1552/**
1553 * dev_forward_skb - loopback an skb to another netif
1554 *
1555 * @dev: destination network device
1556 * @skb: buffer to forward
1557 *
1558 * return values:
1559 * NET_RX_SUCCESS (no congestion)
6ec82562 1560 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1561 *
1562 * dev_forward_skb can be used for injecting an skb from the
1563 * start_xmit function of one device into the receive queue
1564 * of another device.
1565 *
1566 * The receiving device may be in another namespace, so
1567 * we have to clear all information in the skb that could
1568 * impact namespace isolation.
1569 */
1570int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1571{
48c83012
MT
1572 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1573 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1574 atomic_long_inc(&dev->rx_dropped);
1575 kfree_skb(skb);
1576 return NET_RX_DROP;
1577 }
1578 }
1579
44540960 1580 skb_orphan(skb);
c736eefa 1581 nf_reset(skb);
44540960 1582
79b569f0 1583 if (unlikely(!is_skb_forwardable(dev, skb))) {
caf586e5 1584 atomic_long_inc(&dev->rx_dropped);
6ec82562 1585 kfree_skb(skb);
44540960 1586 return NET_RX_DROP;
6ec82562 1587 }
8a83a00b 1588 skb_set_dev(skb, dev);
44540960
AB
1589 skb->tstamp.tv64 = 0;
1590 skb->pkt_type = PACKET_HOST;
1591 skb->protocol = eth_type_trans(skb, dev);
44540960
AB
1592 return netif_rx(skb);
1593}
1594EXPORT_SYMBOL_GPL(dev_forward_skb);
1595
71d9dec2
CG
1596static inline int deliver_skb(struct sk_buff *skb,
1597 struct packet_type *pt_prev,
1598 struct net_device *orig_dev)
1599{
1600 atomic_inc(&skb->users);
1601 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1602}
1603
1da177e4
LT
1604/*
1605 * Support routine. Sends outgoing frames to any network
1606 * taps currently in use.
1607 */
1608
f6a78bfc 1609static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1610{
1611 struct packet_type *ptype;
71d9dec2
CG
1612 struct sk_buff *skb2 = NULL;
1613 struct packet_type *pt_prev = NULL;
a61bbcf2 1614
1da177e4
LT
1615 rcu_read_lock();
1616 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1617 /* Never send packets back to the socket
1618 * they originated from - MvS (miquels@drinkel.ow.org)
1619 */
1620 if ((ptype->dev == dev || !ptype->dev) &&
1621 (ptype->af_packet_priv == NULL ||
1622 (struct sock *)ptype->af_packet_priv != skb->sk)) {
71d9dec2
CG
1623 if (pt_prev) {
1624 deliver_skb(skb2, pt_prev, skb->dev);
1625 pt_prev = ptype;
1626 continue;
1627 }
1628
1629 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1630 if (!skb2)
1631 break;
1632
70978182
ED
1633 net_timestamp_set(skb2);
1634
1da177e4
LT
1635 /* skb->nh should be correctly
1636 set by sender, so that the second statement is
1637 just protection against buggy protocols.
1638 */
459a98ed 1639 skb_reset_mac_header(skb2);
1da177e4 1640
d56f90a7 1641 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1642 skb2->network_header > skb2->tail) {
1da177e4
LT
1643 if (net_ratelimit())
1644 printk(KERN_CRIT "protocol %04x is "
1645 "buggy, dev %s\n",
70777d03
SAS
1646 ntohs(skb2->protocol),
1647 dev->name);
c1d2bbe1 1648 skb_reset_network_header(skb2);
1da177e4
LT
1649 }
1650
b0e380b1 1651 skb2->transport_header = skb2->network_header;
1da177e4 1652 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1653 pt_prev = ptype;
1da177e4
LT
1654 }
1655 }
71d9dec2
CG
1656 if (pt_prev)
1657 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1658 rcu_read_unlock();
1659}
1660
4f57c087
JF
1661/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1662 * @dev: Network device
1663 * @txq: number of queues available
1664 *
1665 * If real_num_tx_queues is changed the tc mappings may no longer be
1666 * valid. To resolve this verify the tc mapping remains valid and if
1667 * not NULL the mapping. With no priorities mapping to this
1668 * offset/count pair it will no longer be used. In the worst case TC0
1669 * is invalid nothing can be done so disable priority mappings. If is
1670 * expected that drivers will fix this mapping if they can before
1671 * calling netif_set_real_num_tx_queues.
1672 */
bb134d22 1673static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1674{
1675 int i;
1676 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1677
1678 /* If TC0 is invalidated disable TC mapping */
1679 if (tc->offset + tc->count > txq) {
1680 pr_warning("Number of in use tx queues changed "
1681 "invalidating tc mappings. Priority "
1682 "traffic classification disabled!\n");
1683 dev->num_tc = 0;
1684 return;
1685 }
1686
1687 /* Invalidated prio to tc mappings set to TC0 */
1688 for (i = 1; i < TC_BITMASK + 1; i++) {
1689 int q = netdev_get_prio_tc_map(dev, i);
1690
1691 tc = &dev->tc_to_txq[q];
1692 if (tc->offset + tc->count > txq) {
1693 pr_warning("Number of in use tx queues "
1694 "changed. Priority %i to tc "
1695 "mapping %i is no longer valid "
1696 "setting map to 0\n",
1697 i, q);
1698 netdev_set_prio_tc_map(dev, i, 0);
1699 }
1700 }
1701}
1702
f0796d5c
JF
1703/*
1704 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1705 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1706 */
e6484930 1707int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 1708{
1d24eb48
TH
1709 int rc;
1710
e6484930
TH
1711 if (txq < 1 || txq > dev->num_tx_queues)
1712 return -EINVAL;
f0796d5c 1713
5c56580b
BH
1714 if (dev->reg_state == NETREG_REGISTERED ||
1715 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
1716 ASSERT_RTNL();
1717
1d24eb48
TH
1718 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1719 txq);
bf264145
TH
1720 if (rc)
1721 return rc;
1722
4f57c087
JF
1723 if (dev->num_tc)
1724 netif_setup_tc(dev, txq);
1725
e6484930
TH
1726 if (txq < dev->real_num_tx_queues)
1727 qdisc_reset_all_tx_gt(dev, txq);
f0796d5c 1728 }
e6484930
TH
1729
1730 dev->real_num_tx_queues = txq;
1731 return 0;
f0796d5c
JF
1732}
1733EXPORT_SYMBOL(netif_set_real_num_tx_queues);
56079431 1734
62fe0b40
BH
1735#ifdef CONFIG_RPS
1736/**
1737 * netif_set_real_num_rx_queues - set actual number of RX queues used
1738 * @dev: Network device
1739 * @rxq: Actual number of RX queues
1740 *
1741 * This must be called either with the rtnl_lock held or before
1742 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
1743 * negative error code. If called before registration, it always
1744 * succeeds.
62fe0b40
BH
1745 */
1746int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1747{
1748 int rc;
1749
bd25fa7b
TH
1750 if (rxq < 1 || rxq > dev->num_rx_queues)
1751 return -EINVAL;
1752
62fe0b40
BH
1753 if (dev->reg_state == NETREG_REGISTERED) {
1754 ASSERT_RTNL();
1755
62fe0b40
BH
1756 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1757 rxq);
1758 if (rc)
1759 return rc;
62fe0b40
BH
1760 }
1761
1762 dev->real_num_rx_queues = rxq;
1763 return 0;
1764}
1765EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1766#endif
1767
def82a1d 1768static inline void __netif_reschedule(struct Qdisc *q)
56079431 1769{
def82a1d
JP
1770 struct softnet_data *sd;
1771 unsigned long flags;
56079431 1772
def82a1d
JP
1773 local_irq_save(flags);
1774 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
1775 q->next_sched = NULL;
1776 *sd->output_queue_tailp = q;
1777 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
1778 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1779 local_irq_restore(flags);
1780}
1781
1782void __netif_schedule(struct Qdisc *q)
1783{
1784 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1785 __netif_reschedule(q);
56079431
DV
1786}
1787EXPORT_SYMBOL(__netif_schedule);
1788
bea3348e 1789void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 1790{
3578b0c8 1791 if (atomic_dec_and_test(&skb->users)) {
bea3348e
SH
1792 struct softnet_data *sd;
1793 unsigned long flags;
56079431 1794
bea3348e
SH
1795 local_irq_save(flags);
1796 sd = &__get_cpu_var(softnet_data);
1797 skb->next = sd->completion_queue;
1798 sd->completion_queue = skb;
1799 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1800 local_irq_restore(flags);
1801 }
56079431 1802}
bea3348e 1803EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
1804
1805void dev_kfree_skb_any(struct sk_buff *skb)
1806{
1807 if (in_irq() || irqs_disabled())
1808 dev_kfree_skb_irq(skb);
1809 else
1810 dev_kfree_skb(skb);
1811}
1812EXPORT_SYMBOL(dev_kfree_skb_any);
1813
1814
bea3348e
SH
1815/**
1816 * netif_device_detach - mark device as removed
1817 * @dev: network device
1818 *
1819 * Mark device as removed from system and therefore no longer available.
1820 */
56079431
DV
1821void netif_device_detach(struct net_device *dev)
1822{
1823 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1824 netif_running(dev)) {
d543103a 1825 netif_tx_stop_all_queues(dev);
56079431
DV
1826 }
1827}
1828EXPORT_SYMBOL(netif_device_detach);
1829
bea3348e
SH
1830/**
1831 * netif_device_attach - mark device as attached
1832 * @dev: network device
1833 *
1834 * Mark device as attached from system and restart if needed.
1835 */
56079431
DV
1836void netif_device_attach(struct net_device *dev)
1837{
1838 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1839 netif_running(dev)) {
d543103a 1840 netif_tx_wake_all_queues(dev);
4ec93edb 1841 __netdev_watchdog_up(dev);
56079431
DV
1842 }
1843}
1844EXPORT_SYMBOL(netif_device_attach);
1845
8a83a00b
AB
1846/**
1847 * skb_dev_set -- assign a new device to a buffer
1848 * @skb: buffer for the new device
1849 * @dev: network device
1850 *
1851 * If an skb is owned by a device already, we have to reset
1852 * all data private to the namespace a device belongs to
1853 * before assigning it a new device.
1854 */
1855#ifdef CONFIG_NET_NS
1856void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1857{
1858 skb_dst_drop(skb);
1859 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1860 secpath_reset(skb);
1861 nf_reset(skb);
1862 skb_init_secmark(skb);
1863 skb->mark = 0;
1864 skb->priority = 0;
1865 skb->nf_trace = 0;
1866 skb->ipvs_property = 0;
1867#ifdef CONFIG_NET_SCHED
1868 skb->tc_index = 0;
1869#endif
1870 }
1871 skb->dev = dev;
1872}
1873EXPORT_SYMBOL(skb_set_dev);
1874#endif /* CONFIG_NET_NS */
1875
1da177e4
LT
1876/*
1877 * Invalidate hardware checksum when packet is to be mangled, and
1878 * complete checksum manually on outgoing path.
1879 */
84fa7933 1880int skb_checksum_help(struct sk_buff *skb)
1da177e4 1881{
d3bc23e7 1882 __wsum csum;
663ead3b 1883 int ret = 0, offset;
1da177e4 1884
84fa7933 1885 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
1886 goto out_set_summed;
1887
1888 if (unlikely(skb_shinfo(skb)->gso_size)) {
a430a43d
HX
1889 /* Let GSO fix up the checksum. */
1890 goto out_set_summed;
1da177e4
LT
1891 }
1892
55508d60 1893 offset = skb_checksum_start_offset(skb);
a030847e
HX
1894 BUG_ON(offset >= skb_headlen(skb));
1895 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1896
1897 offset += skb->csum_offset;
1898 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1899
1900 if (skb_cloned(skb) &&
1901 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
1902 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1903 if (ret)
1904 goto out;
1905 }
1906
a030847e 1907 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 1908out_set_summed:
1da177e4 1909 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 1910out:
1da177e4
LT
1911 return ret;
1912}
d1b19dff 1913EXPORT_SYMBOL(skb_checksum_help);
1da177e4 1914
f6a78bfc
HX
1915/**
1916 * skb_gso_segment - Perform segmentation on skb.
1917 * @skb: buffer to segment
576a30eb 1918 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1919 *
1920 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1921 *
1922 * It may return NULL if the skb requires no segmentation. This is
1923 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1924 */
04ed3e74 1925struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
f6a78bfc
HX
1926{
1927 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1928 struct packet_type *ptype;
252e3346 1929 __be16 type = skb->protocol;
c8d5bcd1 1930 int vlan_depth = ETH_HLEN;
a430a43d 1931 int err;
f6a78bfc 1932
c8d5bcd1
JG
1933 while (type == htons(ETH_P_8021Q)) {
1934 struct vlan_hdr *vh;
7b9c6090 1935
c8d5bcd1 1936 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
7b9c6090
JG
1937 return ERR_PTR(-EINVAL);
1938
c8d5bcd1
JG
1939 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1940 type = vh->h_vlan_encapsulated_proto;
1941 vlan_depth += VLAN_HLEN;
7b9c6090
JG
1942 }
1943
459a98ed 1944 skb_reset_mac_header(skb);
b0e380b1 1945 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1946 __skb_pull(skb, skb->mac_len);
1947
67fd1a73
HX
1948 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1949 struct net_device *dev = skb->dev;
1950 struct ethtool_drvinfo info = {};
1951
1952 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1953 dev->ethtool_ops->get_drvinfo(dev, &info);
1954
b194a367 1955 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
67fd1a73
HX
1956 info.driver, dev ? dev->features : 0L,
1957 skb->sk ? skb->sk->sk_route_caps : 0L,
1958 skb->len, skb->data_len, skb->ip_summed);
1959
a430a43d
HX
1960 if (skb_header_cloned(skb) &&
1961 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1962 return ERR_PTR(err);
1963 }
1964
f6a78bfc 1965 rcu_read_lock();
82d8a867
PE
1966 list_for_each_entry_rcu(ptype,
1967 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1968 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1969 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1970 err = ptype->gso_send_check(skb);
1971 segs = ERR_PTR(err);
1972 if (err || skb_gso_ok(skb, features))
1973 break;
d56f90a7
ACM
1974 __skb_push(skb, (skb->data -
1975 skb_network_header(skb)));
a430a43d 1976 }
576a30eb 1977 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
1978 break;
1979 }
1980 }
1981 rcu_read_unlock();
1982
98e399f8 1983 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 1984
f6a78bfc
HX
1985 return segs;
1986}
f6a78bfc
HX
1987EXPORT_SYMBOL(skb_gso_segment);
1988
fb286bb2
HX
1989/* Take action when hardware reception checksum errors are detected. */
1990#ifdef CONFIG_BUG
1991void netdev_rx_csum_fault(struct net_device *dev)
1992{
1993 if (net_ratelimit()) {
4ec93edb 1994 printk(KERN_ERR "%s: hw csum failure.\n",
246a4212 1995 dev ? dev->name : "<unknown>");
fb286bb2
HX
1996 dump_stack();
1997 }
1998}
1999EXPORT_SYMBOL(netdev_rx_csum_fault);
2000#endif
2001
1da177e4
LT
2002/* Actually, we should eliminate this check as soon as we know, that:
2003 * 1. IOMMU is present and allows to map all the memory.
2004 * 2. No high memory really exists on this machine.
2005 */
2006
9092c658 2007static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2008{
3d3a8533 2009#ifdef CONFIG_HIGHMEM
1da177e4 2010 int i;
5acbbd42 2011 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2012 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2013 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2014 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2015 return 1;
ea2ab693 2016 }
5acbbd42 2017 }
1da177e4 2018
5acbbd42
FT
2019 if (PCI_DMA_BUS_IS_PHYS) {
2020 struct device *pdev = dev->dev.parent;
1da177e4 2021
9092c658
ED
2022 if (!pdev)
2023 return 0;
5acbbd42 2024 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2025 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2026 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2027 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2028 return 1;
2029 }
2030 }
3d3a8533 2031#endif
1da177e4
LT
2032 return 0;
2033}
1da177e4 2034
f6a78bfc
HX
2035struct dev_gso_cb {
2036 void (*destructor)(struct sk_buff *skb);
2037};
2038
2039#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2040
2041static void dev_gso_skb_destructor(struct sk_buff *skb)
2042{
2043 struct dev_gso_cb *cb;
2044
2045 do {
2046 struct sk_buff *nskb = skb->next;
2047
2048 skb->next = nskb->next;
2049 nskb->next = NULL;
2050 kfree_skb(nskb);
2051 } while (skb->next);
2052
2053 cb = DEV_GSO_CB(skb);
2054 if (cb->destructor)
2055 cb->destructor(skb);
2056}
2057
2058/**
2059 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2060 * @skb: buffer to segment
91ecb63c 2061 * @features: device features as applicable to this skb
f6a78bfc
HX
2062 *
2063 * This function segments the given skb and stores the list of segments
2064 * in skb->next.
2065 */
91ecb63c 2066static int dev_gso_segment(struct sk_buff *skb, int features)
f6a78bfc 2067{
f6a78bfc 2068 struct sk_buff *segs;
576a30eb
HX
2069
2070 segs = skb_gso_segment(skb, features);
2071
2072 /* Verifying header integrity only. */
2073 if (!segs)
2074 return 0;
f6a78bfc 2075
801678c5 2076 if (IS_ERR(segs))
f6a78bfc
HX
2077 return PTR_ERR(segs);
2078
2079 skb->next = segs;
2080 DEV_GSO_CB(skb)->destructor = skb->destructor;
2081 skb->destructor = dev_gso_skb_destructor;
2082
2083 return 0;
2084}
2085
fc6055a5
ED
2086/*
2087 * Try to orphan skb early, right before transmission by the device.
2244d07b
OH
2088 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2089 * is needed on driver level for other reasons, e.g. see net/can/raw.c
fc6055a5
ED
2090 */
2091static inline void skb_orphan_try(struct sk_buff *skb)
2092{
87fd308c
ED
2093 struct sock *sk = skb->sk;
2094
2244d07b 2095 if (sk && !skb_shinfo(skb)->tx_flags) {
87fd308c
ED
2096 /* skb_tx_hash() wont be able to get sk.
2097 * We copy sk_hash into skb->rxhash
2098 */
2099 if (!skb->rxhash)
2100 skb->rxhash = sk->sk_hash;
fc6055a5 2101 skb_orphan(skb);
87fd308c 2102 }
fc6055a5
ED
2103}
2104
03634668
JG
2105static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2106{
2107 return ((features & NETIF_F_GEN_CSUM) ||
2108 ((features & NETIF_F_V4_CSUM) &&
2109 protocol == htons(ETH_P_IP)) ||
2110 ((features & NETIF_F_V6_CSUM) &&
2111 protocol == htons(ETH_P_IPV6)) ||
2112 ((features & NETIF_F_FCOE_CRC) &&
2113 protocol == htons(ETH_P_FCOE)));
2114}
2115
04ed3e74 2116static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
f01a5236 2117{
d402786e 2118 if (!can_checksum_protocol(features, protocol)) {
f01a5236
JG
2119 features &= ~NETIF_F_ALL_CSUM;
2120 features &= ~NETIF_F_SG;
2121 } else if (illegal_highdma(skb->dev, skb)) {
2122 features &= ~NETIF_F_SG;
2123 }
2124
2125 return features;
2126}
2127
04ed3e74 2128u32 netif_skb_features(struct sk_buff *skb)
58e998c6
JG
2129{
2130 __be16 protocol = skb->protocol;
04ed3e74 2131 u32 features = skb->dev->features;
58e998c6
JG
2132
2133 if (protocol == htons(ETH_P_8021Q)) {
2134 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2135 protocol = veh->h_vlan_encapsulated_proto;
f01a5236
JG
2136 } else if (!vlan_tx_tag_present(skb)) {
2137 return harmonize_features(skb, protocol, features);
2138 }
58e998c6 2139
6ee400aa 2140 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
f01a5236
JG
2141
2142 if (protocol != htons(ETH_P_8021Q)) {
2143 return harmonize_features(skb, protocol, features);
2144 } else {
2145 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
6ee400aa 2146 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
f01a5236
JG
2147 return harmonize_features(skb, protocol, features);
2148 }
58e998c6 2149}
f01a5236 2150EXPORT_SYMBOL(netif_skb_features);
58e998c6 2151
6afff0ca
JF
2152/*
2153 * Returns true if either:
2154 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2155 * 2. skb is fragmented and the device does not support SG, or if
2156 * at least one of fragments is in highmem and device does not
2157 * support DMA from it.
2158 */
2159static inline int skb_needs_linearize(struct sk_buff *skb,
02932ce9 2160 int features)
6afff0ca 2161{
02932ce9
JG
2162 return skb_is_nonlinear(skb) &&
2163 ((skb_has_frag_list(skb) &&
2164 !(features & NETIF_F_FRAGLIST)) ||
e1e78db6 2165 (skb_shinfo(skb)->nr_frags &&
02932ce9 2166 !(features & NETIF_F_SG)));
6afff0ca
JF
2167}
2168
fd2ea0a7
DM
2169int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2170 struct netdev_queue *txq)
f6a78bfc 2171{
00829823 2172 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2173 int rc = NETDEV_TX_OK;
ec764bf0 2174 unsigned int skb_len;
00829823 2175
f6a78bfc 2176 if (likely(!skb->next)) {
04ed3e74 2177 u32 features;
fc741216 2178
93f154b5 2179 /*
25985edc 2180 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
2181 * its hot in this cpu cache
2182 */
adf30907
ED
2183 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2184 skb_dst_drop(skb);
2185
15c2d75f
ED
2186 if (!list_empty(&ptype_all))
2187 dev_queue_xmit_nit(skb, dev);
2188
fc6055a5 2189 skb_orphan_try(skb);
9ccb8975 2190
fc741216
JG
2191 features = netif_skb_features(skb);
2192
7b9c6090 2193 if (vlan_tx_tag_present(skb) &&
fc741216 2194 !(features & NETIF_F_HW_VLAN_TX)) {
7b9c6090
JG
2195 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2196 if (unlikely(!skb))
2197 goto out;
2198
2199 skb->vlan_tci = 0;
2200 }
2201
fc741216 2202 if (netif_needs_gso(skb, features)) {
91ecb63c 2203 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2204 goto out_kfree_skb;
2205 if (skb->next)
2206 goto gso;
6afff0ca 2207 } else {
02932ce9 2208 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2209 __skb_linearize(skb))
2210 goto out_kfree_skb;
2211
2212 /* If packet is not checksummed and device does not
2213 * support checksumming for this protocol, complete
2214 * checksumming here.
2215 */
2216 if (skb->ip_summed == CHECKSUM_PARTIAL) {
55508d60
MM
2217 skb_set_transport_header(skb,
2218 skb_checksum_start_offset(skb));
03634668 2219 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2220 skb_checksum_help(skb))
2221 goto out_kfree_skb;
2222 }
9ccb8975
DM
2223 }
2224
ec764bf0 2225 skb_len = skb->len;
ac45f602 2226 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2227 trace_net_dev_xmit(skb, rc, dev, skb_len);
ec634fe3 2228 if (rc == NETDEV_TX_OK)
08baf561 2229 txq_trans_update(txq);
ac45f602 2230 return rc;
f6a78bfc
HX
2231 }
2232
576a30eb 2233gso:
f6a78bfc
HX
2234 do {
2235 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2236
2237 skb->next = nskb->next;
2238 nskb->next = NULL;
068a2de5
KK
2239
2240 /*
25985edc 2241 * If device doesn't need nskb->dst, release it right now while
068a2de5
KK
2242 * its hot in this cpu cache
2243 */
2244 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2245 skb_dst_drop(nskb);
2246
ec764bf0 2247 skb_len = nskb->len;
00829823 2248 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2249 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2250 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2251 if (rc & ~NETDEV_TX_MASK)
2252 goto out_kfree_gso_skb;
f54d9e8d 2253 nskb->next = skb->next;
f6a78bfc
HX
2254 skb->next = nskb;
2255 return rc;
2256 }
08baf561 2257 txq_trans_update(txq);
fd2ea0a7 2258 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
f54d9e8d 2259 return NETDEV_TX_BUSY;
f6a78bfc 2260 } while (skb->next);
4ec93edb 2261
572a9d7b
PM
2262out_kfree_gso_skb:
2263 if (likely(skb->next == NULL))
2264 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
2265out_kfree_skb:
2266 kfree_skb(skb);
7b9c6090 2267out:
572a9d7b 2268 return rc;
f6a78bfc
HX
2269}
2270
0a9627f2 2271static u32 hashrnd __read_mostly;
b6b2fed1 2272
a3d22a68
VZ
2273/*
2274 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2275 * to be used as a distribution range.
2276 */
2277u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2278 unsigned int num_tx_queues)
8f0f2223 2279{
7019298a 2280 u32 hash;
4f57c087
JF
2281 u16 qoffset = 0;
2282 u16 qcount = num_tx_queues;
b6b2fed1 2283
513de11b
DM
2284 if (skb_rx_queue_recorded(skb)) {
2285 hash = skb_get_rx_queue(skb);
a3d22a68
VZ
2286 while (unlikely(hash >= num_tx_queues))
2287 hash -= num_tx_queues;
513de11b
DM
2288 return hash;
2289 }
ec581f6a 2290
4f57c087
JF
2291 if (dev->num_tc) {
2292 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2293 qoffset = dev->tc_to_txq[tc].offset;
2294 qcount = dev->tc_to_txq[tc].count;
2295 }
2296
ec581f6a 2297 if (skb->sk && skb->sk->sk_hash)
7019298a 2298 hash = skb->sk->sk_hash;
ec581f6a 2299 else
87fd308c 2300 hash = (__force u16) skb->protocol ^ skb->rxhash;
0a9627f2 2301 hash = jhash_1word(hash, hashrnd);
b6b2fed1 2302
4f57c087 2303 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
8f0f2223 2304}
a3d22a68 2305EXPORT_SYMBOL(__skb_tx_hash);
8f0f2223 2306
ed04642f
ED
2307static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2308{
2309 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2310 if (net_ratelimit()) {
7a161ea9
ED
2311 pr_warning("%s selects TX queue %d, but "
2312 "real number of TX queues is %d\n",
2313 dev->name, queue_index, dev->real_num_tx_queues);
ed04642f
ED
2314 }
2315 return 0;
2316 }
2317 return queue_index;
2318}
2319
1d24eb48
TH
2320static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2321{
bf264145 2322#ifdef CONFIG_XPS
1d24eb48
TH
2323 struct xps_dev_maps *dev_maps;
2324 struct xps_map *map;
2325 int queue_index = -1;
2326
2327 rcu_read_lock();
2328 dev_maps = rcu_dereference(dev->xps_maps);
2329 if (dev_maps) {
2330 map = rcu_dereference(
2331 dev_maps->cpu_map[raw_smp_processor_id()]);
2332 if (map) {
2333 if (map->len == 1)
2334 queue_index = map->queues[0];
2335 else {
2336 u32 hash;
2337 if (skb->sk && skb->sk->sk_hash)
2338 hash = skb->sk->sk_hash;
2339 else
2340 hash = (__force u16) skb->protocol ^
2341 skb->rxhash;
2342 hash = jhash_1word(hash, hashrnd);
2343 queue_index = map->queues[
2344 ((u64)hash * map->len) >> 32];
2345 }
2346 if (unlikely(queue_index >= dev->real_num_tx_queues))
2347 queue_index = -1;
2348 }
2349 }
2350 rcu_read_unlock();
2351
2352 return queue_index;
2353#else
2354 return -1;
2355#endif
2356}
2357
e8a0464c
DM
2358static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2359 struct sk_buff *skb)
2360{
b0f77d0e 2361 int queue_index;
deabc772 2362 const struct net_device_ops *ops = dev->netdev_ops;
a4ee3ce3 2363
3853b584
TH
2364 if (dev->real_num_tx_queues == 1)
2365 queue_index = 0;
2366 else if (ops->ndo_select_queue) {
deabc772
HS
2367 queue_index = ops->ndo_select_queue(dev, skb);
2368 queue_index = dev_cap_txqueue(dev, queue_index);
2369 } else {
2370 struct sock *sk = skb->sk;
2371 queue_index = sk_tx_queue_get(sk);
a4ee3ce3 2372
3853b584
TH
2373 if (queue_index < 0 || skb->ooo_okay ||
2374 queue_index >= dev->real_num_tx_queues) {
2375 int old_index = queue_index;
fd2ea0a7 2376
1d24eb48
TH
2377 queue_index = get_xps_queue(dev, skb);
2378 if (queue_index < 0)
2379 queue_index = skb_tx_hash(dev, skb);
3853b584
TH
2380
2381 if (queue_index != old_index && sk) {
2382 struct dst_entry *dst =
2383 rcu_dereference_check(sk->sk_dst_cache, 1);
8728c544
ED
2384
2385 if (dst && skb_dst(skb) == dst)
2386 sk_tx_queue_set(sk, queue_index);
2387 }
a4ee3ce3
KK
2388 }
2389 }
eae792b7 2390
fd2ea0a7
DM
2391 skb_set_queue_mapping(skb, queue_index);
2392 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2393}
2394
bbd8a0d3
KK
2395static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2396 struct net_device *dev,
2397 struct netdev_queue *txq)
2398{
2399 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2400 bool contended;
bbd8a0d3
KK
2401 int rc;
2402
a2da570d
ED
2403 qdisc_skb_cb(skb)->pkt_len = skb->len;
2404 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2405 /*
2406 * Heuristic to force contended enqueues to serialize on a
2407 * separate lock before trying to get qdisc main lock.
2408 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2409 * and dequeue packets faster.
2410 */
a2da570d 2411 contended = qdisc_is_running(q);
79640a4c
ED
2412 if (unlikely(contended))
2413 spin_lock(&q->busylock);
2414
bbd8a0d3
KK
2415 spin_lock(root_lock);
2416 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2417 kfree_skb(skb);
2418 rc = NET_XMIT_DROP;
2419 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2420 qdisc_run_begin(q)) {
bbd8a0d3
KK
2421 /*
2422 * This is a work-conserving queue; there are no old skbs
2423 * waiting to be sent out; and the qdisc is not running -
2424 * xmit the skb directly.
2425 */
7fee226a
ED
2426 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2427 skb_dst_force(skb);
bfe0d029 2428
bfe0d029
ED
2429 qdisc_bstats_update(q, skb);
2430
79640a4c
ED
2431 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2432 if (unlikely(contended)) {
2433 spin_unlock(&q->busylock);
2434 contended = false;
2435 }
bbd8a0d3 2436 __qdisc_run(q);
79640a4c 2437 } else
bc135b23 2438 qdisc_run_end(q);
bbd8a0d3
KK
2439
2440 rc = NET_XMIT_SUCCESS;
2441 } else {
7fee226a 2442 skb_dst_force(skb);
a2da570d 2443 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2444 if (qdisc_run_begin(q)) {
2445 if (unlikely(contended)) {
2446 spin_unlock(&q->busylock);
2447 contended = false;
2448 }
2449 __qdisc_run(q);
2450 }
bbd8a0d3
KK
2451 }
2452 spin_unlock(root_lock);
79640a4c
ED
2453 if (unlikely(contended))
2454 spin_unlock(&q->busylock);
bbd8a0d3
KK
2455 return rc;
2456}
2457
745e20f1 2458static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2459#define RECURSION_LIMIT 10
745e20f1 2460
d29f749e
DJ
2461/**
2462 * dev_queue_xmit - transmit a buffer
2463 * @skb: buffer to transmit
2464 *
2465 * Queue a buffer for transmission to a network device. The caller must
2466 * have set the device and priority and built the buffer before calling
2467 * this function. The function can be called from an interrupt.
2468 *
2469 * A negative errno code is returned on a failure. A success does not
2470 * guarantee the frame will be transmitted as it may be dropped due
2471 * to congestion or traffic shaping.
2472 *
2473 * -----------------------------------------------------------------------------------
2474 * I notice this method can also return errors from the queue disciplines,
2475 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2476 * be positive.
2477 *
2478 * Regardless of the return value, the skb is consumed, so it is currently
2479 * difficult to retry a send to this method. (You can bump the ref count
2480 * before sending to hold a reference for retry if you are careful.)
2481 *
2482 * When calling this method, interrupts MUST be enabled. This is because
2483 * the BH enable code must have IRQs enabled so that it will not deadlock.
2484 * --BLG
2485 */
1da177e4
LT
2486int dev_queue_xmit(struct sk_buff *skb)
2487{
2488 struct net_device *dev = skb->dev;
dc2b4847 2489 struct netdev_queue *txq;
1da177e4
LT
2490 struct Qdisc *q;
2491 int rc = -ENOMEM;
2492
4ec93edb
YH
2493 /* Disable soft irqs for various locks below. Also
2494 * stops preemption for RCU.
1da177e4 2495 */
4ec93edb 2496 rcu_read_lock_bh();
1da177e4 2497
eae792b7 2498 txq = dev_pick_tx(dev, skb);
a898def2 2499 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2500
1da177e4 2501#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2502 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2503#endif
cf66ba58 2504 trace_net_dev_queue(skb);
1da177e4 2505 if (q->enqueue) {
bbd8a0d3 2506 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2507 goto out;
1da177e4
LT
2508 }
2509
2510 /* The device has no queue. Common case for software devices:
2511 loopback, all the sorts of tunnels...
2512
932ff279
HX
2513 Really, it is unlikely that netif_tx_lock protection is necessary
2514 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
2515 counters.)
2516 However, it is possible, that they rely on protection
2517 made by us here.
2518
2519 Check this and shot the lock. It is not prone from deadlocks.
2520 Either shot noqueue qdisc, it is even simpler 8)
2521 */
2522 if (dev->flags & IFF_UP) {
2523 int cpu = smp_processor_id(); /* ok because BHs are off */
2524
c773e847 2525 if (txq->xmit_lock_owner != cpu) {
1da177e4 2526
745e20f1
ED
2527 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2528 goto recursion_alert;
2529
c773e847 2530 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2531
fd2ea0a7 2532 if (!netif_tx_queue_stopped(txq)) {
745e20f1 2533 __this_cpu_inc(xmit_recursion);
572a9d7b 2534 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2535 __this_cpu_dec(xmit_recursion);
572a9d7b 2536 if (dev_xmit_complete(rc)) {
c773e847 2537 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2538 goto out;
2539 }
2540 }
c773e847 2541 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2542 if (net_ratelimit())
2543 printk(KERN_CRIT "Virtual device %s asks to "
2544 "queue packet!\n", dev->name);
2545 } else {
2546 /* Recursion is detected! It is possible,
745e20f1
ED
2547 * unfortunately
2548 */
2549recursion_alert:
1da177e4
LT
2550 if (net_ratelimit())
2551 printk(KERN_CRIT "Dead loop on virtual device "
2552 "%s, fix it urgently!\n", dev->name);
2553 }
2554 }
2555
2556 rc = -ENETDOWN;
d4828d85 2557 rcu_read_unlock_bh();
1da177e4 2558
1da177e4
LT
2559 kfree_skb(skb);
2560 return rc;
2561out:
d4828d85 2562 rcu_read_unlock_bh();
1da177e4
LT
2563 return rc;
2564}
d1b19dff 2565EXPORT_SYMBOL(dev_queue_xmit);
1da177e4
LT
2566
2567
2568/*=======================================================================
2569 Receiver routines
2570 =======================================================================*/
2571
6b2bedc3 2572int netdev_max_backlog __read_mostly = 1000;
3b098e2d 2573int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2574int netdev_budget __read_mostly = 300;
2575int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2576
eecfd7c4
ED
2577/* Called with irq disabled */
2578static inline void ____napi_schedule(struct softnet_data *sd,
2579 struct napi_struct *napi)
2580{
2581 list_add_tail(&napi->poll_list, &sd->poll_list);
2582 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2583}
2584
0a9627f2 2585/*
bfb564e7 2586 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
bdeab991
TH
2587 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2588 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2589 * if hash is a canonical 4-tuple hash over transport ports.
0a9627f2 2590 */
bdeab991 2591void __skb_get_rxhash(struct sk_buff *skb)
0a9627f2 2592{
12fcdefb 2593 int nhoff, hash = 0, poff;
b71d1d42
ED
2594 const struct ipv6hdr *ip6;
2595 const struct iphdr *ip;
1ff1986f 2596 const struct vlan_hdr *vlan;
0a9627f2 2597 u8 ip_proto;
792df22c
TH
2598 u32 addr1, addr2;
2599 u16 proto;
8c52d509
CG
2600 union {
2601 u32 v32;
2602 u16 v16[2];
2603 } ports;
0a9627f2 2604
bfb564e7 2605 nhoff = skb_network_offset(skb);
792df22c 2606 proto = skb->protocol;
0a9627f2 2607
e971b722 2608again:
792df22c 2609 switch (proto) {
0a9627f2 2610 case __constant_htons(ETH_P_IP):
5dd17e08 2611ip:
bfb564e7 2612 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
0a9627f2
TH
2613 goto done;
2614
b71d1d42 2615 ip = (const struct iphdr *) (skb->data + nhoff);
56f8a75c 2616 if (ip_is_fragment(ip))
dbe5775b
CG
2617 ip_proto = 0;
2618 else
2619 ip_proto = ip->protocol;
b249dcb8
ED
2620 addr1 = (__force u32) ip->saddr;
2621 addr2 = (__force u32) ip->daddr;
792df22c 2622 nhoff += ip->ihl * 4;
0a9627f2
TH
2623 break;
2624 case __constant_htons(ETH_P_IPV6):
5dd17e08 2625ipv6:
bfb564e7 2626 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
0a9627f2
TH
2627 goto done;
2628
b71d1d42 2629 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
0a9627f2 2630 ip_proto = ip6->nexthdr;
b249dcb8
ED
2631 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2632 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
792df22c 2633 nhoff += 40;
0a9627f2 2634 break;
1ff1986f
CG
2635 case __constant_htons(ETH_P_8021Q):
2636 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2637 goto done;
2638 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2639 proto = vlan->h_vlan_encapsulated_proto;
2640 nhoff += sizeof(*vlan);
2641 goto again;
ae1511bf
CG
2642 case __constant_htons(ETH_P_PPP_SES):
2643 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2644 goto done;
2645 proto = *((__be16 *) (skb->data + nhoff +
2646 sizeof(struct pppoe_hdr)));
2647 nhoff += PPPOE_SES_HLEN;
5dd17e08
CG
2648 switch (proto) {
2649 case __constant_htons(PPP_IP):
2650 goto ip;
2651 case __constant_htons(PPP_IPV6):
2652 goto ipv6;
2653 default:
2654 goto done;
2655 }
0a9627f2
TH
2656 default:
2657 goto done;
2658 }
bfb564e7 2659
e971b722 2660 switch (ip_proto) {
c6865cb3
TH
2661 case IPPROTO_GRE:
2662 if (pskb_may_pull(skb, nhoff + 16)) {
2663 u8 *h = skb->data + nhoff;
2664 __be16 flags = *(__be16 *)h;
2665
2666 /*
2667 * Only look inside GRE if version zero and no
2668 * routing
2669 */
2670 if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2671 proto = *(__be16 *)(h + 2);
2672 nhoff += 4;
2673 if (flags & GRE_CSUM)
2674 nhoff += 4;
2675 if (flags & GRE_KEY)
2676 nhoff += 4;
2677 if (flags & GRE_SEQ)
2678 nhoff += 4;
2679 goto again;
2680 }
2681 }
2682 break;
ec5efe79
ED
2683 case IPPROTO_IPIP:
2684 goto again;
e971b722
TH
2685 default:
2686 break;
2687 }
2688
12fcdefb
CG
2689 ports.v32 = 0;
2690 poff = proto_ports_offset(ip_proto);
2691 if (poff >= 0) {
792df22c 2692 nhoff += poff;
12fcdefb
CG
2693 if (pskb_may_pull(skb, nhoff + 4)) {
2694 ports.v32 = * (__force u32 *) (skb->data + nhoff);
8c52d509
CG
2695 if (ports.v16[1] < ports.v16[0])
2696 swap(ports.v16[0], ports.v16[1]);
bdeab991 2697 skb->l4_rxhash = 1;
b249dcb8 2698 }
0a9627f2
TH
2699 }
2700
b249dcb8
ED
2701 /* get a consistent hash (same value on both flow directions) */
2702 if (addr2 < addr1)
2703 swap(addr1, addr2);
0a9627f2 2704
bfb564e7
KK
2705 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2706 if (!hash)
2707 hash = 1;
2708
2709done:
bdeab991 2710 skb->rxhash = hash;
bfb564e7
KK
2711}
2712EXPORT_SYMBOL(__skb_get_rxhash);
2713
2714#ifdef CONFIG_RPS
2715
2716/* One global table that all flow-based protocols share. */
6e3f7faf 2717struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2718EXPORT_SYMBOL(rps_sock_flow_table);
2719
c445477d
BH
2720static struct rps_dev_flow *
2721set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2722 struct rps_dev_flow *rflow, u16 next_cpu)
2723{
09994d1b 2724 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2725#ifdef CONFIG_RFS_ACCEL
2726 struct netdev_rx_queue *rxqueue;
2727 struct rps_dev_flow_table *flow_table;
2728 struct rps_dev_flow *old_rflow;
2729 u32 flow_id;
2730 u16 rxq_index;
2731 int rc;
2732
2733 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
2734 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2735 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
2736 goto out;
2737 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2738 if (rxq_index == skb_get_rx_queue(skb))
2739 goto out;
2740
2741 rxqueue = dev->_rx + rxq_index;
2742 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2743 if (!flow_table)
2744 goto out;
2745 flow_id = skb->rxhash & flow_table->mask;
2746 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2747 rxq_index, flow_id);
2748 if (rc < 0)
2749 goto out;
2750 old_rflow = rflow;
2751 rflow = &flow_table->flows[flow_id];
c445477d
BH
2752 rflow->filter = rc;
2753 if (old_rflow->filter == rflow->filter)
2754 old_rflow->filter = RPS_NO_FILTER;
2755 out:
2756#endif
2757 rflow->last_qtail =
09994d1b 2758 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
2759 }
2760
09994d1b 2761 rflow->cpu = next_cpu;
c445477d
BH
2762 return rflow;
2763}
2764
bfb564e7
KK
2765/*
2766 * get_rps_cpu is called from netif_receive_skb and returns the target
2767 * CPU from the RPS map of the receiving queue for a given skb.
2768 * rcu_read_lock must be held on entry.
2769 */
2770static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2771 struct rps_dev_flow **rflowp)
2772{
2773 struct netdev_rx_queue *rxqueue;
6e3f7faf 2774 struct rps_map *map;
bfb564e7
KK
2775 struct rps_dev_flow_table *flow_table;
2776 struct rps_sock_flow_table *sock_flow_table;
2777 int cpu = -1;
2778 u16 tcpu;
2779
2780 if (skb_rx_queue_recorded(skb)) {
2781 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2782 if (unlikely(index >= dev->real_num_rx_queues)) {
2783 WARN_ONCE(dev->real_num_rx_queues > 1,
2784 "%s received packet on queue %u, but number "
2785 "of RX queues is %u\n",
2786 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2787 goto done;
2788 }
2789 rxqueue = dev->_rx + index;
2790 } else
2791 rxqueue = dev->_rx;
2792
6e3f7faf
ED
2793 map = rcu_dereference(rxqueue->rps_map);
2794 if (map) {
85875236 2795 if (map->len == 1 &&
33d480ce 2796 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
2797 tcpu = map->cpus[0];
2798 if (cpu_online(tcpu))
2799 cpu = tcpu;
2800 goto done;
2801 }
33d480ce 2802 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 2803 goto done;
6febfca9 2804 }
bfb564e7 2805
2d47b459 2806 skb_reset_network_header(skb);
bfb564e7
KK
2807 if (!skb_get_rxhash(skb))
2808 goto done;
2809
fec5e652
TH
2810 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2811 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2812 if (flow_table && sock_flow_table) {
2813 u16 next_cpu;
2814 struct rps_dev_flow *rflow;
2815
2816 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2817 tcpu = rflow->cpu;
2818
2819 next_cpu = sock_flow_table->ents[skb->rxhash &
2820 sock_flow_table->mask];
2821
2822 /*
2823 * If the desired CPU (where last recvmsg was done) is
2824 * different from current CPU (one in the rx-queue flow
2825 * table entry), switch if one of the following holds:
2826 * - Current CPU is unset (equal to RPS_NO_CPU).
2827 * - Current CPU is offline.
2828 * - The current CPU's queue tail has advanced beyond the
2829 * last packet that was enqueued using this table entry.
2830 * This guarantees that all previous packets for the flow
2831 * have been dequeued, thus preserving in order delivery.
2832 */
2833 if (unlikely(tcpu != next_cpu) &&
2834 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2835 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
c445477d
BH
2836 rflow->last_qtail)) >= 0))
2837 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2838
fec5e652
TH
2839 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2840 *rflowp = rflow;
2841 cpu = tcpu;
2842 goto done;
2843 }
2844 }
2845
0a9627f2 2846 if (map) {
fec5e652 2847 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
2848
2849 if (cpu_online(tcpu)) {
2850 cpu = tcpu;
2851 goto done;
2852 }
2853 }
2854
2855done:
0a9627f2
TH
2856 return cpu;
2857}
2858
c445477d
BH
2859#ifdef CONFIG_RFS_ACCEL
2860
2861/**
2862 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2863 * @dev: Device on which the filter was set
2864 * @rxq_index: RX queue index
2865 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2866 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2867 *
2868 * Drivers that implement ndo_rx_flow_steer() should periodically call
2869 * this function for each installed filter and remove the filters for
2870 * which it returns %true.
2871 */
2872bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2873 u32 flow_id, u16 filter_id)
2874{
2875 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2876 struct rps_dev_flow_table *flow_table;
2877 struct rps_dev_flow *rflow;
2878 bool expire = true;
2879 int cpu;
2880
2881 rcu_read_lock();
2882 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2883 if (flow_table && flow_id <= flow_table->mask) {
2884 rflow = &flow_table->flows[flow_id];
2885 cpu = ACCESS_ONCE(rflow->cpu);
2886 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2887 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2888 rflow->last_qtail) <
2889 (int)(10 * flow_table->mask)))
2890 expire = false;
2891 }
2892 rcu_read_unlock();
2893 return expire;
2894}
2895EXPORT_SYMBOL(rps_may_expire_flow);
2896
2897#endif /* CONFIG_RFS_ACCEL */
2898
0a9627f2 2899/* Called from hardirq (IPI) context */
e36fa2f7 2900static void rps_trigger_softirq(void *data)
0a9627f2 2901{
e36fa2f7
ED
2902 struct softnet_data *sd = data;
2903
eecfd7c4 2904 ____napi_schedule(sd, &sd->backlog);
dee42870 2905 sd->received_rps++;
0a9627f2 2906}
e36fa2f7 2907
fec5e652 2908#endif /* CONFIG_RPS */
0a9627f2 2909
e36fa2f7
ED
2910/*
2911 * Check if this softnet_data structure is another cpu one
2912 * If yes, queue it to our IPI list and return 1
2913 * If no, return 0
2914 */
2915static int rps_ipi_queued(struct softnet_data *sd)
2916{
2917#ifdef CONFIG_RPS
2918 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2919
2920 if (sd != mysd) {
2921 sd->rps_ipi_next = mysd->rps_ipi_list;
2922 mysd->rps_ipi_list = sd;
2923
2924 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2925 return 1;
2926 }
2927#endif /* CONFIG_RPS */
2928 return 0;
2929}
2930
0a9627f2
TH
2931/*
2932 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2933 * queue (may be a remote CPU queue).
2934 */
fec5e652
TH
2935static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2936 unsigned int *qtail)
0a9627f2 2937{
e36fa2f7 2938 struct softnet_data *sd;
0a9627f2
TH
2939 unsigned long flags;
2940
e36fa2f7 2941 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
2942
2943 local_irq_save(flags);
0a9627f2 2944
e36fa2f7 2945 rps_lock(sd);
6e7676c1
CG
2946 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2947 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 2948enqueue:
e36fa2f7 2949 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 2950 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 2951 rps_unlock(sd);
152102c7 2952 local_irq_restore(flags);
0a9627f2
TH
2953 return NET_RX_SUCCESS;
2954 }
2955
ebda37c2
ED
2956 /* Schedule NAPI for backlog device
2957 * We can use non atomic operation since we own the queue lock
2958 */
2959 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 2960 if (!rps_ipi_queued(sd))
eecfd7c4 2961 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
2962 }
2963 goto enqueue;
2964 }
2965
dee42870 2966 sd->dropped++;
e36fa2f7 2967 rps_unlock(sd);
0a9627f2 2968
0a9627f2
TH
2969 local_irq_restore(flags);
2970
caf586e5 2971 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
2972 kfree_skb(skb);
2973 return NET_RX_DROP;
2974}
1da177e4 2975
1da177e4
LT
2976/**
2977 * netif_rx - post buffer to the network code
2978 * @skb: buffer to post
2979 *
2980 * This function receives a packet from a device driver and queues it for
2981 * the upper (protocol) levels to process. It always succeeds. The buffer
2982 * may be dropped during processing for congestion control or by the
2983 * protocol layers.
2984 *
2985 * return values:
2986 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2987 * NET_RX_DROP (packet was dropped)
2988 *
2989 */
2990
2991int netif_rx(struct sk_buff *skb)
2992{
b0e28f1e 2993 int ret;
1da177e4
LT
2994
2995 /* if netpoll wants it, pretend we never saw it */
2996 if (netpoll_rx(skb))
2997 return NET_RX_DROP;
2998
588f0330 2999 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3000
cf66ba58 3001 trace_netif_rx(skb);
df334545 3002#ifdef CONFIG_RPS
b0e28f1e 3003 {
fec5e652 3004 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3005 int cpu;
3006
cece1945 3007 preempt_disable();
b0e28f1e 3008 rcu_read_lock();
fec5e652
TH
3009
3010 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3011 if (cpu < 0)
3012 cpu = smp_processor_id();
fec5e652
TH
3013
3014 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3015
b0e28f1e 3016 rcu_read_unlock();
cece1945 3017 preempt_enable();
b0e28f1e 3018 }
1e94d72f 3019#else
fec5e652
TH
3020 {
3021 unsigned int qtail;
3022 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3023 put_cpu();
3024 }
1e94d72f 3025#endif
b0e28f1e 3026 return ret;
1da177e4 3027}
d1b19dff 3028EXPORT_SYMBOL(netif_rx);
1da177e4
LT
3029
3030int netif_rx_ni(struct sk_buff *skb)
3031{
3032 int err;
3033
3034 preempt_disable();
3035 err = netif_rx(skb);
3036 if (local_softirq_pending())
3037 do_softirq();
3038 preempt_enable();
3039
3040 return err;
3041}
1da177e4
LT
3042EXPORT_SYMBOL(netif_rx_ni);
3043
1da177e4
LT
3044static void net_tx_action(struct softirq_action *h)
3045{
3046 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3047
3048 if (sd->completion_queue) {
3049 struct sk_buff *clist;
3050
3051 local_irq_disable();
3052 clist = sd->completion_queue;
3053 sd->completion_queue = NULL;
3054 local_irq_enable();
3055
3056 while (clist) {
3057 struct sk_buff *skb = clist;
3058 clist = clist->next;
3059
547b792c 3060 WARN_ON(atomic_read(&skb->users));
07dc22e7 3061 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3062 __kfree_skb(skb);
3063 }
3064 }
3065
3066 if (sd->output_queue) {
37437bb2 3067 struct Qdisc *head;
1da177e4
LT
3068
3069 local_irq_disable();
3070 head = sd->output_queue;
3071 sd->output_queue = NULL;
a9cbd588 3072 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3073 local_irq_enable();
3074
3075 while (head) {
37437bb2
DM
3076 struct Qdisc *q = head;
3077 spinlock_t *root_lock;
3078
1da177e4
LT
3079 head = head->next_sched;
3080
5fb66229 3081 root_lock = qdisc_lock(q);
37437bb2 3082 if (spin_trylock(root_lock)) {
def82a1d
JP
3083 smp_mb__before_clear_bit();
3084 clear_bit(__QDISC_STATE_SCHED,
3085 &q->state);
37437bb2
DM
3086 qdisc_run(q);
3087 spin_unlock(root_lock);
1da177e4 3088 } else {
195648bb 3089 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3090 &q->state)) {
195648bb 3091 __netif_reschedule(q);
e8a83e10
JP
3092 } else {
3093 smp_mb__before_clear_bit();
3094 clear_bit(__QDISC_STATE_SCHED,
3095 &q->state);
3096 }
1da177e4
LT
3097 }
3098 }
3099 }
3100}
3101
ab95bfe0
JP
3102#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3103 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3104/* This hook is defined here for ATM LANE */
3105int (*br_fdb_test_addr_hook)(struct net_device *dev,
3106 unsigned char *addr) __read_mostly;
4fb019a0 3107EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3108#endif
1da177e4 3109
1da177e4
LT
3110#ifdef CONFIG_NET_CLS_ACT
3111/* TODO: Maybe we should just force sch_ingress to be compiled in
3112 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3113 * a compare and 2 stores extra right now if we dont have it on
3114 * but have CONFIG_NET_CLS_ACT
25985edc
LDM
3115 * NOTE: This doesn't stop any functionality; if you dont have
3116 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3117 *
3118 */
24824a09 3119static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3120{
1da177e4 3121 struct net_device *dev = skb->dev;
f697c3e8 3122 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3123 int result = TC_ACT_OK;
3124 struct Qdisc *q;
4ec93edb 3125
de384830
SH
3126 if (unlikely(MAX_RED_LOOP < ttl++)) {
3127 if (net_ratelimit())
3128 pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3129 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3130 return TC_ACT_SHOT;
3131 }
1da177e4 3132
f697c3e8
HX
3133 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3134 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3135
83874000 3136 q = rxq->qdisc;
8d50b53d 3137 if (q != &noop_qdisc) {
83874000 3138 spin_lock(qdisc_lock(q));
a9312ae8
DM
3139 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3140 result = qdisc_enqueue_root(skb, q);
83874000
DM
3141 spin_unlock(qdisc_lock(q));
3142 }
f697c3e8
HX
3143
3144 return result;
3145}
86e65da9 3146
f697c3e8
HX
3147static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3148 struct packet_type **pt_prev,
3149 int *ret, struct net_device *orig_dev)
3150{
24824a09
ED
3151 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3152
3153 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3154 goto out;
1da177e4 3155
f697c3e8
HX
3156 if (*pt_prev) {
3157 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3158 *pt_prev = NULL;
1da177e4
LT
3159 }
3160
24824a09 3161 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3162 case TC_ACT_SHOT:
3163 case TC_ACT_STOLEN:
3164 kfree_skb(skb);
3165 return NULL;
3166 }
3167
3168out:
3169 skb->tc_verd = 0;
3170 return skb;
1da177e4
LT
3171}
3172#endif
3173
ab95bfe0
JP
3174/**
3175 * netdev_rx_handler_register - register receive handler
3176 * @dev: device to register a handler for
3177 * @rx_handler: receive handler to register
93e2c32b 3178 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
3179 *
3180 * Register a receive hander for a device. This handler will then be
3181 * called from __netif_receive_skb. A negative errno code is returned
3182 * on a failure.
3183 *
3184 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3185 *
3186 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3187 */
3188int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3189 rx_handler_func_t *rx_handler,
3190 void *rx_handler_data)
ab95bfe0
JP
3191{
3192 ASSERT_RTNL();
3193
3194 if (dev->rx_handler)
3195 return -EBUSY;
3196
93e2c32b 3197 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3198 rcu_assign_pointer(dev->rx_handler, rx_handler);
3199
3200 return 0;
3201}
3202EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3203
3204/**
3205 * netdev_rx_handler_unregister - unregister receive handler
3206 * @dev: device to unregister a handler from
3207 *
3208 * Unregister a receive hander from a device.
3209 *
3210 * The caller must hold the rtnl_mutex.
3211 */
3212void netdev_rx_handler_unregister(struct net_device *dev)
3213{
3214
3215 ASSERT_RTNL();
a9b3cd7f
SH
3216 RCU_INIT_POINTER(dev->rx_handler, NULL);
3217 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3218}
3219EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3220
10f744d2 3221static int __netif_receive_skb(struct sk_buff *skb)
1da177e4
LT
3222{
3223 struct packet_type *ptype, *pt_prev;
ab95bfe0 3224 rx_handler_func_t *rx_handler;
f2ccd8fa 3225 struct net_device *orig_dev;
63d8ea7f 3226 struct net_device *null_or_dev;
8a4eb573 3227 bool deliver_exact = false;
1da177e4 3228 int ret = NET_RX_DROP;
252e3346 3229 __be16 type;
1da177e4 3230
588f0330 3231 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3232
cf66ba58 3233 trace_netif_receive_skb(skb);
9b22ea56 3234
1da177e4 3235 /* if we've gotten here through NAPI, check netpoll */
bea3348e 3236 if (netpoll_receive_skb(skb))
1da177e4
LT
3237 return NET_RX_DROP;
3238
8964be4a
ED
3239 if (!skb->skb_iif)
3240 skb->skb_iif = skb->dev->ifindex;
cc9bd5ce 3241 orig_dev = skb->dev;
8f903c70 3242
c1d2bbe1 3243 skb_reset_network_header(skb);
badff6d0 3244 skb_reset_transport_header(skb);
0b5c9db1 3245 skb_reset_mac_len(skb);
1da177e4
LT
3246
3247 pt_prev = NULL;
3248
3249 rcu_read_lock();
3250
63d8ea7f
DM
3251another_round:
3252
3253 __this_cpu_inc(softnet_data.processed);
3254
bcc6d479
JP
3255 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3256 skb = vlan_untag(skb);
3257 if (unlikely(!skb))
3258 goto out;
3259 }
3260
1da177e4
LT
3261#ifdef CONFIG_NET_CLS_ACT
3262 if (skb->tc_verd & TC_NCLS) {
3263 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3264 goto ncls;
3265 }
3266#endif
3267
3268 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3269 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3270 if (pt_prev)
f2ccd8fa 3271 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3272 pt_prev = ptype;
3273 }
3274 }
3275
3276#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3277 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3278 if (!skb)
1da177e4 3279 goto out;
1da177e4
LT
3280ncls:
3281#endif
3282
6a32e4f9 3283 rx_handler = rcu_dereference(skb->dev->rx_handler);
2425717b
JF
3284 if (vlan_tx_tag_present(skb)) {
3285 if (pt_prev) {
3286 ret = deliver_skb(skb, pt_prev, orig_dev);
3287 pt_prev = NULL;
3288 }
6a32e4f9 3289 if (vlan_do_receive(&skb, !rx_handler))
2425717b
JF
3290 goto another_round;
3291 else if (unlikely(!skb))
3292 goto out;
3293 }
3294
ab95bfe0
JP
3295 if (rx_handler) {
3296 if (pt_prev) {
3297 ret = deliver_skb(skb, pt_prev, orig_dev);
3298 pt_prev = NULL;
3299 }
8a4eb573
JP
3300 switch (rx_handler(&skb)) {
3301 case RX_HANDLER_CONSUMED:
ab95bfe0 3302 goto out;
8a4eb573 3303 case RX_HANDLER_ANOTHER:
63d8ea7f 3304 goto another_round;
8a4eb573
JP
3305 case RX_HANDLER_EXACT:
3306 deliver_exact = true;
3307 case RX_HANDLER_PASS:
3308 break;
3309 default:
3310 BUG();
3311 }
ab95bfe0 3312 }
1da177e4 3313
63d8ea7f 3314 /* deliver only exact match when indicated */
8a4eb573 3315 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3316
1da177e4 3317 type = skb->protocol;
82d8a867
PE
3318 list_for_each_entry_rcu(ptype,
3319 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3320 if (ptype->type == type &&
e3f48d37
JP
3321 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3322 ptype->dev == orig_dev)) {
4ec93edb 3323 if (pt_prev)
f2ccd8fa 3324 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3325 pt_prev = ptype;
3326 }
3327 }
3328
3329 if (pt_prev) {
f2ccd8fa 3330 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3331 } else {
caf586e5 3332 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3333 kfree_skb(skb);
3334 /* Jamal, now you will not able to escape explaining
3335 * me how you were going to use this. :-)
3336 */
3337 ret = NET_RX_DROP;
3338 }
3339
3340out:
3341 rcu_read_unlock();
3342 return ret;
3343}
0a9627f2
TH
3344
3345/**
3346 * netif_receive_skb - process receive buffer from network
3347 * @skb: buffer to process
3348 *
3349 * netif_receive_skb() is the main receive data processing function.
3350 * It always succeeds. The buffer may be dropped during processing
3351 * for congestion control or by the protocol layers.
3352 *
3353 * This function may only be called from softirq context and interrupts
3354 * should be enabled.
3355 *
3356 * Return values (usually ignored):
3357 * NET_RX_SUCCESS: no congestion
3358 * NET_RX_DROP: packet was dropped
3359 */
3360int netif_receive_skb(struct sk_buff *skb)
3361{
588f0330 3362 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3363
c1f19b51
RC
3364 if (skb_defer_rx_timestamp(skb))
3365 return NET_RX_SUCCESS;
3366
df334545 3367#ifdef CONFIG_RPS
3b098e2d
ED
3368 {
3369 struct rps_dev_flow voidflow, *rflow = &voidflow;
3370 int cpu, ret;
fec5e652 3371
3b098e2d
ED
3372 rcu_read_lock();
3373
3374 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3375
3b098e2d
ED
3376 if (cpu >= 0) {
3377 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3378 rcu_read_unlock();
3379 } else {
3380 rcu_read_unlock();
3381 ret = __netif_receive_skb(skb);
3382 }
0a9627f2 3383
3b098e2d 3384 return ret;
fec5e652 3385 }
1e94d72f
TH
3386#else
3387 return __netif_receive_skb(skb);
3388#endif
0a9627f2 3389}
d1b19dff 3390EXPORT_SYMBOL(netif_receive_skb);
1da177e4 3391
88751275
ED
3392/* Network device is going away, flush any packets still pending
3393 * Called with irqs disabled.
3394 */
152102c7 3395static void flush_backlog(void *arg)
6e583ce5 3396{
152102c7 3397 struct net_device *dev = arg;
e36fa2f7 3398 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3399 struct sk_buff *skb, *tmp;
3400
e36fa2f7 3401 rps_lock(sd);
6e7676c1 3402 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3403 if (skb->dev == dev) {
e36fa2f7 3404 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3405 kfree_skb(skb);
76cc8b13 3406 input_queue_head_incr(sd);
6e583ce5 3407 }
6e7676c1 3408 }
e36fa2f7 3409 rps_unlock(sd);
6e7676c1
CG
3410
3411 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3412 if (skb->dev == dev) {
3413 __skb_unlink(skb, &sd->process_queue);
3414 kfree_skb(skb);
76cc8b13 3415 input_queue_head_incr(sd);
6e7676c1
CG
3416 }
3417 }
6e583ce5
SH
3418}
3419
d565b0a1
HX
3420static int napi_gro_complete(struct sk_buff *skb)
3421{
3422 struct packet_type *ptype;
3423 __be16 type = skb->protocol;
3424 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3425 int err = -ENOENT;
3426
fc59f9a3
HX
3427 if (NAPI_GRO_CB(skb)->count == 1) {
3428 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3429 goto out;
fc59f9a3 3430 }
d565b0a1
HX
3431
3432 rcu_read_lock();
3433 list_for_each_entry_rcu(ptype, head, list) {
3434 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3435 continue;
3436
3437 err = ptype->gro_complete(skb);
3438 break;
3439 }
3440 rcu_read_unlock();
3441
3442 if (err) {
3443 WARN_ON(&ptype->list == head);
3444 kfree_skb(skb);
3445 return NET_RX_SUCCESS;
3446 }
3447
3448out:
d565b0a1
HX
3449 return netif_receive_skb(skb);
3450}
3451
86cac58b 3452inline void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
3453{
3454 struct sk_buff *skb, *next;
3455
3456 for (skb = napi->gro_list; skb; skb = next) {
3457 next = skb->next;
3458 skb->next = NULL;
3459 napi_gro_complete(skb);
3460 }
3461
4ae5544f 3462 napi->gro_count = 0;
d565b0a1
HX
3463 napi->gro_list = NULL;
3464}
86cac58b 3465EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3466
5b252f0c 3467enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3468{
3469 struct sk_buff **pp = NULL;
3470 struct packet_type *ptype;
3471 __be16 type = skb->protocol;
3472 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 3473 int same_flow;
d565b0a1 3474 int mac_len;
5b252f0c 3475 enum gro_result ret;
d565b0a1 3476
ce9e76c8 3477 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3478 goto normal;
3479
21dc3301 3480 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3481 goto normal;
3482
d565b0a1
HX
3483 rcu_read_lock();
3484 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
3485 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3486 continue;
3487
86911732 3488 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
3489 mac_len = skb->network_header - skb->mac_header;
3490 skb->mac_len = mac_len;
3491 NAPI_GRO_CB(skb)->same_flow = 0;
3492 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3493 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3494
d565b0a1
HX
3495 pp = ptype->gro_receive(&napi->gro_list, skb);
3496 break;
3497 }
3498 rcu_read_unlock();
3499
3500 if (&ptype->list == head)
3501 goto normal;
3502
0da2afd5 3503 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3504 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3505
d565b0a1
HX
3506 if (pp) {
3507 struct sk_buff *nskb = *pp;
3508
3509 *pp = nskb->next;
3510 nskb->next = NULL;
3511 napi_gro_complete(nskb);
4ae5544f 3512 napi->gro_count--;
d565b0a1
HX
3513 }
3514
0da2afd5 3515 if (same_flow)
d565b0a1
HX
3516 goto ok;
3517
4ae5544f 3518 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3519 goto normal;
d565b0a1 3520
4ae5544f 3521 napi->gro_count++;
d565b0a1 3522 NAPI_GRO_CB(skb)->count = 1;
86911732 3523 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3524 skb->next = napi->gro_list;
3525 napi->gro_list = skb;
5d0d9be8 3526 ret = GRO_HELD;
d565b0a1 3527
ad0f9904 3528pull:
cb18978c
HX
3529 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3530 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3531
3532 BUG_ON(skb->end - skb->tail < grow);
3533
3534 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3535
3536 skb->tail += grow;
3537 skb->data_len -= grow;
3538
3539 skb_shinfo(skb)->frags[0].page_offset += grow;
9e903e08 3540 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
cb18978c 3541
9e903e08 3542 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
ea2ab693 3543 skb_frag_unref(skb, 0);
cb18978c
HX
3544 memmove(skb_shinfo(skb)->frags,
3545 skb_shinfo(skb)->frags + 1,
e5093aec 3546 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3547 }
ad0f9904
HX
3548 }
3549
d565b0a1 3550ok:
5d0d9be8 3551 return ret;
d565b0a1
HX
3552
3553normal:
ad0f9904
HX
3554 ret = GRO_NORMAL;
3555 goto pull;
5d38a079 3556}
96e93eab
HX
3557EXPORT_SYMBOL(dev_gro_receive);
3558
40d0802b 3559static inline gro_result_t
5b252f0c 3560__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
3561{
3562 struct sk_buff *p;
3563
3564 for (p = napi->gro_list; p; p = p->next) {
40d0802b
ED
3565 unsigned long diffs;
3566
3567 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3701e513 3568 diffs |= p->vlan_tci ^ skb->vlan_tci;
40d0802b 3569 diffs |= compare_ether_header(skb_mac_header(p),
f64f9e71 3570 skb_gro_mac_header(skb));
40d0802b 3571 NAPI_GRO_CB(p)->same_flow = !diffs;
96e93eab
HX
3572 NAPI_GRO_CB(p)->flush = 0;
3573 }
3574
3575 return dev_gro_receive(napi, skb);
3576}
5d38a079 3577
c7c4b3b6 3578gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3579{
5d0d9be8
HX
3580 switch (ret) {
3581 case GRO_NORMAL:
c7c4b3b6
BH
3582 if (netif_receive_skb(skb))
3583 ret = GRO_DROP;
3584 break;
5d38a079 3585
5d0d9be8 3586 case GRO_DROP:
5d0d9be8 3587 case GRO_MERGED_FREE:
5d38a079
HX
3588 kfree_skb(skb);
3589 break;
5b252f0c
BH
3590
3591 case GRO_HELD:
3592 case GRO_MERGED:
3593 break;
5d38a079
HX
3594 }
3595
c7c4b3b6 3596 return ret;
5d0d9be8
HX
3597}
3598EXPORT_SYMBOL(napi_skb_finish);
3599
78a478d0
HX
3600void skb_gro_reset_offset(struct sk_buff *skb)
3601{
3602 NAPI_GRO_CB(skb)->data_offset = 0;
3603 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3604 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3605
78d3fd0b 3606 if (skb->mac_header == skb->tail &&
ea2ab693 3607 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
78a478d0 3608 NAPI_GRO_CB(skb)->frag0 =
ea2ab693 3609 skb_frag_address(&skb_shinfo(skb)->frags[0]);
9e903e08 3610 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
7489594c 3611 }
78a478d0
HX
3612}
3613EXPORT_SYMBOL(skb_gro_reset_offset);
3614
c7c4b3b6 3615gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3616{
86911732
HX
3617 skb_gro_reset_offset(skb);
3618
5d0d9be8 3619 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
3620}
3621EXPORT_SYMBOL(napi_gro_receive);
3622
d0c2b0d2 3623static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3624{
96e93eab
HX
3625 __skb_pull(skb, skb_headlen(skb));
3626 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3701e513 3627 skb->vlan_tci = 0;
66c46d74 3628 skb->dev = napi->dev;
6d152e23 3629 skb->skb_iif = 0;
96e93eab
HX
3630
3631 napi->skb = skb;
3632}
96e93eab 3633
76620aaf 3634struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3635{
5d38a079 3636 struct sk_buff *skb = napi->skb;
5d38a079
HX
3637
3638 if (!skb) {
89d71a66
ED
3639 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3640 if (skb)
3641 napi->skb = skb;
80595d59 3642 }
96e93eab
HX
3643 return skb;
3644}
76620aaf 3645EXPORT_SYMBOL(napi_get_frags);
96e93eab 3646
c7c4b3b6
BH
3647gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3648 gro_result_t ret)
96e93eab 3649{
5d0d9be8
HX
3650 switch (ret) {
3651 case GRO_NORMAL:
86911732 3652 case GRO_HELD:
e76b69cc 3653 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3654
c7c4b3b6
BH
3655 if (ret == GRO_HELD)
3656 skb_gro_pull(skb, -ETH_HLEN);
3657 else if (netif_receive_skb(skb))
3658 ret = GRO_DROP;
86911732 3659 break;
5d38a079 3660
5d0d9be8 3661 case GRO_DROP:
5d0d9be8
HX
3662 case GRO_MERGED_FREE:
3663 napi_reuse_skb(napi, skb);
3664 break;
5b252f0c
BH
3665
3666 case GRO_MERGED:
3667 break;
5d0d9be8 3668 }
5d38a079 3669
c7c4b3b6 3670 return ret;
5d38a079 3671}
5d0d9be8
HX
3672EXPORT_SYMBOL(napi_frags_finish);
3673
76620aaf
HX
3674struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3675{
3676 struct sk_buff *skb = napi->skb;
3677 struct ethhdr *eth;
a5b1cf28
HX
3678 unsigned int hlen;
3679 unsigned int off;
76620aaf
HX
3680
3681 napi->skb = NULL;
3682
3683 skb_reset_mac_header(skb);
3684 skb_gro_reset_offset(skb);
3685
a5b1cf28
HX
3686 off = skb_gro_offset(skb);
3687 hlen = off + sizeof(*eth);
3688 eth = skb_gro_header_fast(skb, off);
3689 if (skb_gro_header_hard(skb, hlen)) {
3690 eth = skb_gro_header_slow(skb, hlen, off);
3691 if (unlikely(!eth)) {
3692 napi_reuse_skb(napi, skb);
3693 skb = NULL;
3694 goto out;
3695 }
76620aaf
HX
3696 }
3697
3698 skb_gro_pull(skb, sizeof(*eth));
3699
3700 /*
3701 * This works because the only protocols we care about don't require
3702 * special handling. We'll fix it up properly at the end.
3703 */
3704 skb->protocol = eth->h_proto;
3705
3706out:
3707 return skb;
3708}
3709EXPORT_SYMBOL(napi_frags_skb);
3710
c7c4b3b6 3711gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3712{
76620aaf 3713 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3714
3715 if (!skb)
c7c4b3b6 3716 return GRO_DROP;
5d0d9be8
HX
3717
3718 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3719}
5d38a079
HX
3720EXPORT_SYMBOL(napi_gro_frags);
3721
e326bed2
ED
3722/*
3723 * net_rps_action sends any pending IPI's for rps.
3724 * Note: called with local irq disabled, but exits with local irq enabled.
3725 */
3726static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3727{
3728#ifdef CONFIG_RPS
3729 struct softnet_data *remsd = sd->rps_ipi_list;
3730
3731 if (remsd) {
3732 sd->rps_ipi_list = NULL;
3733
3734 local_irq_enable();
3735
3736 /* Send pending IPI's to kick RPS processing on remote cpus. */
3737 while (remsd) {
3738 struct softnet_data *next = remsd->rps_ipi_next;
3739
3740 if (cpu_online(remsd->cpu))
3741 __smp_call_function_single(remsd->cpu,
3742 &remsd->csd, 0);
3743 remsd = next;
3744 }
3745 } else
3746#endif
3747 local_irq_enable();
3748}
3749
bea3348e 3750static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3751{
3752 int work = 0;
eecfd7c4 3753 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 3754
e326bed2
ED
3755#ifdef CONFIG_RPS
3756 /* Check if we have pending ipi, its better to send them now,
3757 * not waiting net_rx_action() end.
3758 */
3759 if (sd->rps_ipi_list) {
3760 local_irq_disable();
3761 net_rps_action_and_irq_enable(sd);
3762 }
3763#endif
bea3348e 3764 napi->weight = weight_p;
6e7676c1
CG
3765 local_irq_disable();
3766 while (work < quota) {
1da177e4 3767 struct sk_buff *skb;
6e7676c1
CG
3768 unsigned int qlen;
3769
3770 while ((skb = __skb_dequeue(&sd->process_queue))) {
3771 local_irq_enable();
3772 __netif_receive_skb(skb);
6e7676c1 3773 local_irq_disable();
76cc8b13
TH
3774 input_queue_head_incr(sd);
3775 if (++work >= quota) {
3776 local_irq_enable();
3777 return work;
3778 }
6e7676c1 3779 }
1da177e4 3780
e36fa2f7 3781 rps_lock(sd);
6e7676c1 3782 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 3783 if (qlen)
6e7676c1
CG
3784 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3785 &sd->process_queue);
76cc8b13 3786
6e7676c1 3787 if (qlen < quota - work) {
eecfd7c4
ED
3788 /*
3789 * Inline a custom version of __napi_complete().
3790 * only current cpu owns and manipulates this napi,
3791 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3792 * we can use a plain write instead of clear_bit(),
3793 * and we dont need an smp_mb() memory barrier.
3794 */
3795 list_del(&napi->poll_list);
3796 napi->state = 0;
3797
6e7676c1 3798 quota = work + qlen;
bea3348e 3799 }
e36fa2f7 3800 rps_unlock(sd);
6e7676c1
CG
3801 }
3802 local_irq_enable();
1da177e4 3803
bea3348e
SH
3804 return work;
3805}
1da177e4 3806
bea3348e
SH
3807/**
3808 * __napi_schedule - schedule for receive
c4ea43c5 3809 * @n: entry to schedule
bea3348e
SH
3810 *
3811 * The entry's receive function will be scheduled to run
3812 */
b5606c2d 3813void __napi_schedule(struct napi_struct *n)
bea3348e
SH
3814{
3815 unsigned long flags;
1da177e4 3816
bea3348e 3817 local_irq_save(flags);
eecfd7c4 3818 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 3819 local_irq_restore(flags);
1da177e4 3820}
bea3348e
SH
3821EXPORT_SYMBOL(__napi_schedule);
3822
d565b0a1
HX
3823void __napi_complete(struct napi_struct *n)
3824{
3825 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3826 BUG_ON(n->gro_list);
3827
3828 list_del(&n->poll_list);
3829 smp_mb__before_clear_bit();
3830 clear_bit(NAPI_STATE_SCHED, &n->state);
3831}
3832EXPORT_SYMBOL(__napi_complete);
3833
3834void napi_complete(struct napi_struct *n)
3835{
3836 unsigned long flags;
3837
3838 /*
3839 * don't let napi dequeue from the cpu poll list
3840 * just in case its running on a different cpu
3841 */
3842 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3843 return;
3844
3845 napi_gro_flush(n);
3846 local_irq_save(flags);
3847 __napi_complete(n);
3848 local_irq_restore(flags);
3849}
3850EXPORT_SYMBOL(napi_complete);
3851
3852void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3853 int (*poll)(struct napi_struct *, int), int weight)
3854{
3855 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 3856 napi->gro_count = 0;
d565b0a1 3857 napi->gro_list = NULL;
5d38a079 3858 napi->skb = NULL;
d565b0a1
HX
3859 napi->poll = poll;
3860 napi->weight = weight;
3861 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 3862 napi->dev = dev;
5d38a079 3863#ifdef CONFIG_NETPOLL
d565b0a1
HX
3864 spin_lock_init(&napi->poll_lock);
3865 napi->poll_owner = -1;
3866#endif
3867 set_bit(NAPI_STATE_SCHED, &napi->state);
3868}
3869EXPORT_SYMBOL(netif_napi_add);
3870
3871void netif_napi_del(struct napi_struct *napi)
3872{
3873 struct sk_buff *skb, *next;
3874
d7b06636 3875 list_del_init(&napi->dev_list);
76620aaf 3876 napi_free_frags(napi);
d565b0a1
HX
3877
3878 for (skb = napi->gro_list; skb; skb = next) {
3879 next = skb->next;
3880 skb->next = NULL;
3881 kfree_skb(skb);
3882 }
3883
3884 napi->gro_list = NULL;
4ae5544f 3885 napi->gro_count = 0;
d565b0a1
HX
3886}
3887EXPORT_SYMBOL(netif_napi_del);
3888
1da177e4
LT
3889static void net_rx_action(struct softirq_action *h)
3890{
e326bed2 3891 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 3892 unsigned long time_limit = jiffies + 2;
51b0bded 3893 int budget = netdev_budget;
53fb95d3
MM
3894 void *have;
3895
1da177e4
LT
3896 local_irq_disable();
3897
e326bed2 3898 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
3899 struct napi_struct *n;
3900 int work, weight;
1da177e4 3901
bea3348e 3902 /* If softirq window is exhuasted then punt.
24f8b238
SH
3903 * Allow this to run for 2 jiffies since which will allow
3904 * an average latency of 1.5/HZ.
bea3348e 3905 */
24f8b238 3906 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3907 goto softnet_break;
3908
3909 local_irq_enable();
3910
bea3348e
SH
3911 /* Even though interrupts have been re-enabled, this
3912 * access is safe because interrupts can only add new
3913 * entries to the tail of this list, and only ->poll()
3914 * calls can remove this head entry from the list.
3915 */
e326bed2 3916 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 3917
bea3348e
SH
3918 have = netpoll_poll_lock(n);
3919
3920 weight = n->weight;
3921
0a7606c1
DM
3922 /* This NAPI_STATE_SCHED test is for avoiding a race
3923 * with netpoll's poll_napi(). Only the entity which
3924 * obtains the lock and sees NAPI_STATE_SCHED set will
3925 * actually make the ->poll() call. Therefore we avoid
25985edc 3926 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
3927 */
3928 work = 0;
4ea7e386 3929 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3930 work = n->poll(n, weight);
4ea7e386
NH
3931 trace_napi_poll(n);
3932 }
bea3348e
SH
3933
3934 WARN_ON_ONCE(work > weight);
3935
3936 budget -= work;
3937
3938 local_irq_disable();
3939
3940 /* Drivers must not modify the NAPI state if they
3941 * consume the entire weight. In such cases this code
3942 * still "owns" the NAPI instance and therefore can
3943 * move the instance around on the list at-will.
3944 */
fed17f30 3945 if (unlikely(work == weight)) {
ff780cd8
HX
3946 if (unlikely(napi_disable_pending(n))) {
3947 local_irq_enable();
3948 napi_complete(n);
3949 local_irq_disable();
3950 } else
e326bed2 3951 list_move_tail(&n->poll_list, &sd->poll_list);
fed17f30 3952 }
bea3348e
SH
3953
3954 netpoll_poll_unlock(have);
1da177e4
LT
3955 }
3956out:
e326bed2 3957 net_rps_action_and_irq_enable(sd);
0a9627f2 3958
db217334
CL
3959#ifdef CONFIG_NET_DMA
3960 /*
3961 * There may not be any more sk_buffs coming right now, so push
3962 * any pending DMA copies to hardware
3963 */
2ba05622 3964 dma_issue_pending_all();
db217334 3965#endif
bea3348e 3966
1da177e4
LT
3967 return;
3968
3969softnet_break:
dee42870 3970 sd->time_squeeze++;
1da177e4
LT
3971 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3972 goto out;
3973}
3974
d1b19dff 3975static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3976
3977/**
3978 * register_gifconf - register a SIOCGIF handler
3979 * @family: Address family
3980 * @gifconf: Function handler
3981 *
3982 * Register protocol dependent address dumping routines. The handler
3983 * that is passed must not be freed or reused until it has been replaced
3984 * by another handler.
3985 */
d1b19dff 3986int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3987{
3988 if (family >= NPROTO)
3989 return -EINVAL;
3990 gifconf_list[family] = gifconf;
3991 return 0;
3992}
d1b19dff 3993EXPORT_SYMBOL(register_gifconf);
1da177e4
LT
3994
3995
3996/*
3997 * Map an interface index to its name (SIOCGIFNAME)
3998 */
3999
4000/*
4001 * We need this ioctl for efficient implementation of the
4002 * if_indextoname() function required by the IPv6 API. Without
4003 * it, we would have to search all the interfaces to find a
4004 * match. --pb
4005 */
4006
881d966b 4007static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
4008{
4009 struct net_device *dev;
4010 struct ifreq ifr;
4011
4012 /*
4013 * Fetch the caller's info block.
4014 */
4015
4016 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4017 return -EFAULT;
4018
fb699dfd
ED
4019 rcu_read_lock();
4020 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 4021 if (!dev) {
fb699dfd 4022 rcu_read_unlock();
1da177e4
LT
4023 return -ENODEV;
4024 }
4025
4026 strcpy(ifr.ifr_name, dev->name);
fb699dfd 4027 rcu_read_unlock();
1da177e4
LT
4028
4029 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4030 return -EFAULT;
4031 return 0;
4032}
4033
4034/*
4035 * Perform a SIOCGIFCONF call. This structure will change
4036 * size eventually, and there is nothing I can do about it.
4037 * Thus we will need a 'compatibility mode'.
4038 */
4039
881d966b 4040static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
4041{
4042 struct ifconf ifc;
4043 struct net_device *dev;
4044 char __user *pos;
4045 int len;
4046 int total;
4047 int i;
4048
4049 /*
4050 * Fetch the caller's info block.
4051 */
4052
4053 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4054 return -EFAULT;
4055
4056 pos = ifc.ifc_buf;
4057 len = ifc.ifc_len;
4058
4059 /*
4060 * Loop over the interfaces, and write an info block for each.
4061 */
4062
4063 total = 0;
881d966b 4064 for_each_netdev(net, dev) {
1da177e4
LT
4065 for (i = 0; i < NPROTO; i++) {
4066 if (gifconf_list[i]) {
4067 int done;
4068 if (!pos)
4069 done = gifconf_list[i](dev, NULL, 0);
4070 else
4071 done = gifconf_list[i](dev, pos + total,
4072 len - total);
4073 if (done < 0)
4074 return -EFAULT;
4075 total += done;
4076 }
4077 }
4ec93edb 4078 }
1da177e4
LT
4079
4080 /*
4081 * All done. Write the updated control block back to the caller.
4082 */
4083 ifc.ifc_len = total;
4084
4085 /*
4086 * Both BSD and Solaris return 0 here, so we do too.
4087 */
4088 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4089}
4090
4091#ifdef CONFIG_PROC_FS
f04565dd
MM
4092
4093#define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4094
4095struct dev_iter_state {
4096 struct seq_net_private p;
4097 unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4098};
4099
4100#define get_bucket(x) ((x) >> BUCKET_SPACE)
4101#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4102#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4103
4104static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4105{
4106 struct dev_iter_state *state = seq->private;
4107 struct net *net = seq_file_net(seq);
4108 struct net_device *dev;
4109 struct hlist_node *p;
4110 struct hlist_head *h;
4111 unsigned int count, bucket, offset;
4112
4113 bucket = get_bucket(state->pos);
4114 offset = get_offset(state->pos);
4115 h = &net->dev_name_head[bucket];
4116 count = 0;
4117 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4118 if (count++ == offset) {
4119 state->pos = set_bucket_offset(bucket, count);
4120 return dev;
4121 }
4122 }
4123
4124 return NULL;
4125}
4126
4127static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4128{
4129 struct dev_iter_state *state = seq->private;
4130 struct net_device *dev;
4131 unsigned int bucket;
4132
4133 bucket = get_bucket(state->pos);
4134 do {
4135 dev = dev_from_same_bucket(seq);
4136 if (dev)
4137 return dev;
4138
4139 bucket++;
4140 state->pos = set_bucket_offset(bucket, 0);
4141 } while (bucket < NETDEV_HASHENTRIES);
4142
4143 return NULL;
4144}
4145
1da177e4
LT
4146/*
4147 * This is invoked by the /proc filesystem handler to display a device
4148 * in detail.
4149 */
7562f876 4150void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 4151 __acquires(RCU)
1da177e4 4152{
f04565dd 4153 struct dev_iter_state *state = seq->private;
1da177e4 4154
c6d14c84 4155 rcu_read_lock();
7562f876
PE
4156 if (!*pos)
4157 return SEQ_START_TOKEN;
1da177e4 4158
f04565dd
MM
4159 /* check for end of the hash */
4160 if (state->pos == 0 && *pos > 1)
4161 return NULL;
1da177e4 4162
f04565dd 4163 return dev_from_new_bucket(seq);
1da177e4
LT
4164}
4165
4166void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4167{
f04565dd
MM
4168 struct net_device *dev;
4169
4170 ++*pos;
ccf43438
ED
4171
4172 if (v == SEQ_START_TOKEN)
f04565dd 4173 return dev_from_new_bucket(seq);
c6d14c84 4174
f04565dd
MM
4175 dev = dev_from_same_bucket(seq);
4176 if (dev)
4177 return dev;
4178
4179 return dev_from_new_bucket(seq);
1da177e4
LT
4180}
4181
4182void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 4183 __releases(RCU)
1da177e4 4184{
c6d14c84 4185 rcu_read_unlock();
1da177e4
LT
4186}
4187
4188static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4189{
28172739
ED
4190 struct rtnl_link_stats64 temp;
4191 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
1da177e4 4192
be1f3c2c
BH
4193 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4194 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
5a1b5898
RR
4195 dev->name, stats->rx_bytes, stats->rx_packets,
4196 stats->rx_errors,
4197 stats->rx_dropped + stats->rx_missed_errors,
4198 stats->rx_fifo_errors,
4199 stats->rx_length_errors + stats->rx_over_errors +
4200 stats->rx_crc_errors + stats->rx_frame_errors,
4201 stats->rx_compressed, stats->multicast,
4202 stats->tx_bytes, stats->tx_packets,
4203 stats->tx_errors, stats->tx_dropped,
4204 stats->tx_fifo_errors, stats->collisions,
4205 stats->tx_carrier_errors +
4206 stats->tx_aborted_errors +
4207 stats->tx_window_errors +
4208 stats->tx_heartbeat_errors,
4209 stats->tx_compressed);
1da177e4
LT
4210}
4211
4212/*
4213 * Called from the PROCfs module. This now uses the new arbitrary sized
4214 * /proc/net interface to create /proc/net/dev
4215 */
4216static int dev_seq_show(struct seq_file *seq, void *v)
4217{
4218 if (v == SEQ_START_TOKEN)
4219 seq_puts(seq, "Inter-| Receive "
4220 " | Transmit\n"
4221 " face |bytes packets errs drop fifo frame "
4222 "compressed multicast|bytes packets errs "
4223 "drop fifo colls carrier compressed\n");
4224 else
4225 dev_seq_printf_stats(seq, v);
4226 return 0;
4227}
4228
dee42870 4229static struct softnet_data *softnet_get_online(loff_t *pos)
1da177e4 4230{
dee42870 4231 struct softnet_data *sd = NULL;
1da177e4 4232
0c0b0aca 4233 while (*pos < nr_cpu_ids)
4ec93edb 4234 if (cpu_online(*pos)) {
dee42870 4235 sd = &per_cpu(softnet_data, *pos);
1da177e4
LT
4236 break;
4237 } else
4238 ++*pos;
dee42870 4239 return sd;
1da177e4
LT
4240}
4241
4242static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4243{
4244 return softnet_get_online(pos);
4245}
4246
4247static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4248{
4249 ++*pos;
4250 return softnet_get_online(pos);
4251}
4252
4253static void softnet_seq_stop(struct seq_file *seq, void *v)
4254{
4255}
4256
4257static int softnet_seq_show(struct seq_file *seq, void *v)
4258{
dee42870 4259 struct softnet_data *sd = v;
1da177e4 4260
0a9627f2 4261 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
dee42870 4262 sd->processed, sd->dropped, sd->time_squeeze, 0,
c1ebcdb8 4263 0, 0, 0, 0, /* was fastroute */
dee42870 4264 sd->cpu_collision, sd->received_rps);
1da177e4
LT
4265 return 0;
4266}
4267
f690808e 4268static const struct seq_operations dev_seq_ops = {
1da177e4
LT
4269 .start = dev_seq_start,
4270 .next = dev_seq_next,
4271 .stop = dev_seq_stop,
4272 .show = dev_seq_show,
4273};
4274
4275static int dev_seq_open(struct inode *inode, struct file *file)
4276{
e372c414 4277 return seq_open_net(inode, file, &dev_seq_ops,
f04565dd 4278 sizeof(struct dev_iter_state));
1da177e4
LT
4279}
4280
9a32144e 4281static const struct file_operations dev_seq_fops = {
1da177e4
LT
4282 .owner = THIS_MODULE,
4283 .open = dev_seq_open,
4284 .read = seq_read,
4285 .llseek = seq_lseek,
e372c414 4286 .release = seq_release_net,
1da177e4
LT
4287};
4288
f690808e 4289static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
4290 .start = softnet_seq_start,
4291 .next = softnet_seq_next,
4292 .stop = softnet_seq_stop,
4293 .show = softnet_seq_show,
4294};
4295
4296static int softnet_seq_open(struct inode *inode, struct file *file)
4297{
4298 return seq_open(file, &softnet_seq_ops);
4299}
4300
9a32144e 4301static const struct file_operations softnet_seq_fops = {
1da177e4
LT
4302 .owner = THIS_MODULE,
4303 .open = softnet_seq_open,
4304 .read = seq_read,
4305 .llseek = seq_lseek,
4306 .release = seq_release,
4307};
4308
0e1256ff
SH
4309static void *ptype_get_idx(loff_t pos)
4310{
4311 struct packet_type *pt = NULL;
4312 loff_t i = 0;
4313 int t;
4314
4315 list_for_each_entry_rcu(pt, &ptype_all, list) {
4316 if (i == pos)
4317 return pt;
4318 ++i;
4319 }
4320
82d8a867 4321 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
4322 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4323 if (i == pos)
4324 return pt;
4325 ++i;
4326 }
4327 }
4328 return NULL;
4329}
4330
4331static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 4332 __acquires(RCU)
0e1256ff
SH
4333{
4334 rcu_read_lock();
4335 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4336}
4337
4338static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4339{
4340 struct packet_type *pt;
4341 struct list_head *nxt;
4342 int hash;
4343
4344 ++*pos;
4345 if (v == SEQ_START_TOKEN)
4346 return ptype_get_idx(0);
4347
4348 pt = v;
4349 nxt = pt->list.next;
4350 if (pt->type == htons(ETH_P_ALL)) {
4351 if (nxt != &ptype_all)
4352 goto found;
4353 hash = 0;
4354 nxt = ptype_base[0].next;
4355 } else
82d8a867 4356 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
4357
4358 while (nxt == &ptype_base[hash]) {
82d8a867 4359 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
4360 return NULL;
4361 nxt = ptype_base[hash].next;
4362 }
4363found:
4364 return list_entry(nxt, struct packet_type, list);
4365}
4366
4367static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 4368 __releases(RCU)
0e1256ff
SH
4369{
4370 rcu_read_unlock();
4371}
4372
0e1256ff
SH
4373static int ptype_seq_show(struct seq_file *seq, void *v)
4374{
4375 struct packet_type *pt = v;
4376
4377 if (v == SEQ_START_TOKEN)
4378 seq_puts(seq, "Type Device Function\n");
c346dca1 4379 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
4380 if (pt->type == htons(ETH_P_ALL))
4381 seq_puts(seq, "ALL ");
4382 else
4383 seq_printf(seq, "%04x", ntohs(pt->type));
4384
908cd2da
AD
4385 seq_printf(seq, " %-8s %pF\n",
4386 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
4387 }
4388
4389 return 0;
4390}
4391
4392static const struct seq_operations ptype_seq_ops = {
4393 .start = ptype_seq_start,
4394 .next = ptype_seq_next,
4395 .stop = ptype_seq_stop,
4396 .show = ptype_seq_show,
4397};
4398
4399static int ptype_seq_open(struct inode *inode, struct file *file)
4400{
2feb27db
PE
4401 return seq_open_net(inode, file, &ptype_seq_ops,
4402 sizeof(struct seq_net_private));
0e1256ff
SH
4403}
4404
4405static const struct file_operations ptype_seq_fops = {
4406 .owner = THIS_MODULE,
4407 .open = ptype_seq_open,
4408 .read = seq_read,
4409 .llseek = seq_lseek,
2feb27db 4410 .release = seq_release_net,
0e1256ff
SH
4411};
4412
4413
4665079c 4414static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
4415{
4416 int rc = -ENOMEM;
4417
881d966b 4418 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 4419 goto out;
881d966b 4420 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 4421 goto out_dev;
881d966b 4422 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 4423 goto out_softnet;
0e1256ff 4424
881d966b 4425 if (wext_proc_init(net))
457c4cbc 4426 goto out_ptype;
1da177e4
LT
4427 rc = 0;
4428out:
4429 return rc;
457c4cbc 4430out_ptype:
881d966b 4431 proc_net_remove(net, "ptype");
1da177e4 4432out_softnet:
881d966b 4433 proc_net_remove(net, "softnet_stat");
1da177e4 4434out_dev:
881d966b 4435 proc_net_remove(net, "dev");
1da177e4
LT
4436 goto out;
4437}
881d966b 4438
4665079c 4439static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
4440{
4441 wext_proc_exit(net);
4442
4443 proc_net_remove(net, "ptype");
4444 proc_net_remove(net, "softnet_stat");
4445 proc_net_remove(net, "dev");
4446}
4447
022cbae6 4448static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
4449 .init = dev_proc_net_init,
4450 .exit = dev_proc_net_exit,
4451};
4452
4453static int __init dev_proc_init(void)
4454{
4455 return register_pernet_subsys(&dev_proc_ops);
4456}
1da177e4
LT
4457#else
4458#define dev_proc_init() 0
4459#endif /* CONFIG_PROC_FS */
4460
4461
4462/**
1765a575 4463 * netdev_set_master - set up master pointer
1da177e4
LT
4464 * @slave: slave device
4465 * @master: new master device
4466 *
4467 * Changes the master device of the slave. Pass %NULL to break the
4468 * bonding. The caller must hold the RTNL semaphore. On a failure
4469 * a negative errno code is returned. On success the reference counts
1765a575 4470 * are adjusted and the function returns zero.
1da177e4
LT
4471 */
4472int netdev_set_master(struct net_device *slave, struct net_device *master)
4473{
4474 struct net_device *old = slave->master;
4475
4476 ASSERT_RTNL();
4477
4478 if (master) {
4479 if (old)
4480 return -EBUSY;
4481 dev_hold(master);
4482 }
4483
4484 slave->master = master;
4ec93edb 4485
6df427fe 4486 if (old)
1da177e4 4487 dev_put(old);
1765a575
JP
4488 return 0;
4489}
4490EXPORT_SYMBOL(netdev_set_master);
4491
4492/**
4493 * netdev_set_bond_master - set up bonding master/slave pair
4494 * @slave: slave device
4495 * @master: new master device
4496 *
4497 * Changes the master device of the slave. Pass %NULL to break the
4498 * bonding. The caller must hold the RTNL semaphore. On a failure
4499 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4500 * to the routing socket and the function returns zero.
4501 */
4502int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4503{
4504 int err;
4505
4506 ASSERT_RTNL();
4507
4508 err = netdev_set_master(slave, master);
4509 if (err)
4510 return err;
1da177e4
LT
4511 if (master)
4512 slave->flags |= IFF_SLAVE;
4513 else
4514 slave->flags &= ~IFF_SLAVE;
4515
4516 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4517 return 0;
4518}
1765a575 4519EXPORT_SYMBOL(netdev_set_bond_master);
1da177e4 4520
b6c40d68
PM
4521static void dev_change_rx_flags(struct net_device *dev, int flags)
4522{
d314774c
SH
4523 const struct net_device_ops *ops = dev->netdev_ops;
4524
4525 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4526 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4527}
4528
dad9b335 4529static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4
LT
4530{
4531 unsigned short old_flags = dev->flags;
8192b0c4
DH
4532 uid_t uid;
4533 gid_t gid;
1da177e4 4534
24023451
PM
4535 ASSERT_RTNL();
4536
dad9b335
WC
4537 dev->flags |= IFF_PROMISC;
4538 dev->promiscuity += inc;
4539 if (dev->promiscuity == 0) {
4540 /*
4541 * Avoid overflow.
4542 * If inc causes overflow, untouch promisc and return error.
4543 */
4544 if (inc < 0)
4545 dev->flags &= ~IFF_PROMISC;
4546 else {
4547 dev->promiscuity -= inc;
4548 printk(KERN_WARNING "%s: promiscuity touches roof, "
4549 "set promiscuity failed, promiscuity feature "
4550 "of device might be broken.\n", dev->name);
4551 return -EOVERFLOW;
4552 }
4553 }
52609c0b 4554 if (dev->flags != old_flags) {
1da177e4
LT
4555 printk(KERN_INFO "device %s %s promiscuous mode\n",
4556 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4ec93edb 4557 "left");
8192b0c4
DH
4558 if (audit_enabled) {
4559 current_uid_gid(&uid, &gid);
7759db82
KHK
4560 audit_log(current->audit_context, GFP_ATOMIC,
4561 AUDIT_ANOM_PROMISCUOUS,
4562 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4563 dev->name, (dev->flags & IFF_PROMISC),
4564 (old_flags & IFF_PROMISC),
4565 audit_get_loginuid(current),
8192b0c4 4566 uid, gid,
7759db82 4567 audit_get_sessionid(current));
8192b0c4 4568 }
24023451 4569
b6c40d68 4570 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4571 }
dad9b335 4572 return 0;
1da177e4
LT
4573}
4574
4417da66
PM
4575/**
4576 * dev_set_promiscuity - update promiscuity count on a device
4577 * @dev: device
4578 * @inc: modifier
4579 *
4580 * Add or remove promiscuity from a device. While the count in the device
4581 * remains above zero the interface remains promiscuous. Once it hits zero
4582 * the device reverts back to normal filtering operation. A negative inc
4583 * value is used to drop promiscuity on the device.
dad9b335 4584 * Return 0 if successful or a negative errno code on error.
4417da66 4585 */
dad9b335 4586int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66
PM
4587{
4588 unsigned short old_flags = dev->flags;
dad9b335 4589 int err;
4417da66 4590
dad9b335 4591 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4592 if (err < 0)
dad9b335 4593 return err;
4417da66
PM
4594 if (dev->flags != old_flags)
4595 dev_set_rx_mode(dev);
dad9b335 4596 return err;
4417da66 4597}
d1b19dff 4598EXPORT_SYMBOL(dev_set_promiscuity);
4417da66 4599
1da177e4
LT
4600/**
4601 * dev_set_allmulti - update allmulti count on a device
4602 * @dev: device
4603 * @inc: modifier
4604 *
4605 * Add or remove reception of all multicast frames to a device. While the
4606 * count in the device remains above zero the interface remains listening
4607 * to all interfaces. Once it hits zero the device reverts back to normal
4608 * filtering operation. A negative @inc value is used to drop the counter
4609 * when releasing a resource needing all multicasts.
dad9b335 4610 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4611 */
4612
dad9b335 4613int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4
LT
4614{
4615 unsigned short old_flags = dev->flags;
4616
24023451
PM
4617 ASSERT_RTNL();
4618
1da177e4 4619 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4620 dev->allmulti += inc;
4621 if (dev->allmulti == 0) {
4622 /*
4623 * Avoid overflow.
4624 * If inc causes overflow, untouch allmulti and return error.
4625 */
4626 if (inc < 0)
4627 dev->flags &= ~IFF_ALLMULTI;
4628 else {
4629 dev->allmulti -= inc;
4630 printk(KERN_WARNING "%s: allmulti touches roof, "
4631 "set allmulti failed, allmulti feature of "
4632 "device might be broken.\n", dev->name);
4633 return -EOVERFLOW;
4634 }
4635 }
24023451 4636 if (dev->flags ^ old_flags) {
b6c40d68 4637 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4638 dev_set_rx_mode(dev);
24023451 4639 }
dad9b335 4640 return 0;
4417da66 4641}
d1b19dff 4642EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
4643
4644/*
4645 * Upload unicast and multicast address lists to device and
4646 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4647 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4648 * are present.
4649 */
4650void __dev_set_rx_mode(struct net_device *dev)
4651{
d314774c
SH
4652 const struct net_device_ops *ops = dev->netdev_ops;
4653
4417da66
PM
4654 /* dev_open will call this function so the list will stay sane. */
4655 if (!(dev->flags&IFF_UP))
4656 return;
4657
4658 if (!netif_device_present(dev))
40b77c94 4659 return;
4417da66 4660
01789349 4661 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
4662 /* Unicast addresses changes may only happen under the rtnl,
4663 * therefore calling __dev_set_promiscuity here is safe.
4664 */
32e7bfc4 4665 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66 4666 __dev_set_promiscuity(dev, 1);
2d348d1f 4667 dev->uc_promisc = true;
32e7bfc4 4668 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66 4669 __dev_set_promiscuity(dev, -1);
2d348d1f 4670 dev->uc_promisc = false;
4417da66 4671 }
4417da66 4672 }
01789349
JP
4673
4674 if (ops->ndo_set_rx_mode)
4675 ops->ndo_set_rx_mode(dev);
4417da66
PM
4676}
4677
4678void dev_set_rx_mode(struct net_device *dev)
4679{
b9e40857 4680 netif_addr_lock_bh(dev);
4417da66 4681 __dev_set_rx_mode(dev);
b9e40857 4682 netif_addr_unlock_bh(dev);
1da177e4
LT
4683}
4684
f0db275a
SH
4685/**
4686 * dev_get_flags - get flags reported to userspace
4687 * @dev: device
4688 *
4689 * Get the combination of flag bits exported through APIs to userspace.
4690 */
1da177e4
LT
4691unsigned dev_get_flags(const struct net_device *dev)
4692{
4693 unsigned flags;
4694
4695 flags = (dev->flags & ~(IFF_PROMISC |
4696 IFF_ALLMULTI |
b00055aa
SR
4697 IFF_RUNNING |
4698 IFF_LOWER_UP |
4699 IFF_DORMANT)) |
1da177e4
LT
4700 (dev->gflags & (IFF_PROMISC |
4701 IFF_ALLMULTI));
4702
b00055aa
SR
4703 if (netif_running(dev)) {
4704 if (netif_oper_up(dev))
4705 flags |= IFF_RUNNING;
4706 if (netif_carrier_ok(dev))
4707 flags |= IFF_LOWER_UP;
4708 if (netif_dormant(dev))
4709 flags |= IFF_DORMANT;
4710 }
1da177e4
LT
4711
4712 return flags;
4713}
d1b19dff 4714EXPORT_SYMBOL(dev_get_flags);
1da177e4 4715
bd380811 4716int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4717{
1da177e4 4718 int old_flags = dev->flags;
bd380811 4719 int ret;
1da177e4 4720
24023451
PM
4721 ASSERT_RTNL();
4722
1da177e4
LT
4723 /*
4724 * Set the flags on our device.
4725 */
4726
4727 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4728 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4729 IFF_AUTOMEDIA)) |
4730 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4731 IFF_ALLMULTI));
4732
4733 /*
4734 * Load in the correct multicast list now the flags have changed.
4735 */
4736
b6c40d68
PM
4737 if ((old_flags ^ flags) & IFF_MULTICAST)
4738 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4739
4417da66 4740 dev_set_rx_mode(dev);
1da177e4
LT
4741
4742 /*
4743 * Have we downed the interface. We handle IFF_UP ourselves
4744 * according to user attempts to set it, rather than blindly
4745 * setting it.
4746 */
4747
4748 ret = 0;
4749 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4750 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4751
4752 if (!ret)
4417da66 4753 dev_set_rx_mode(dev);
1da177e4
LT
4754 }
4755
1da177e4 4756 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4757 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4758
1da177e4
LT
4759 dev->gflags ^= IFF_PROMISC;
4760 dev_set_promiscuity(dev, inc);
4761 }
4762
4763 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4764 is important. Some (broken) drivers set IFF_PROMISC, when
4765 IFF_ALLMULTI is requested not asking us and not reporting.
4766 */
4767 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4768 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4769
1da177e4
LT
4770 dev->gflags ^= IFF_ALLMULTI;
4771 dev_set_allmulti(dev, inc);
4772 }
4773
bd380811
PM
4774 return ret;
4775}
4776
4777void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4778{
4779 unsigned int changes = dev->flags ^ old_flags;
4780
4781 if (changes & IFF_UP) {
4782 if (dev->flags & IFF_UP)
4783 call_netdevice_notifiers(NETDEV_UP, dev);
4784 else
4785 call_netdevice_notifiers(NETDEV_DOWN, dev);
4786 }
4787
4788 if (dev->flags & IFF_UP &&
4789 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4790 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4791}
4792
4793/**
4794 * dev_change_flags - change device settings
4795 * @dev: device
4796 * @flags: device state flags
4797 *
4798 * Change settings on device based state flags. The flags are
4799 * in the userspace exported format.
4800 */
4801int dev_change_flags(struct net_device *dev, unsigned flags)
4802{
4803 int ret, changes;
4804 int old_flags = dev->flags;
4805
4806 ret = __dev_change_flags(dev, flags);
4807 if (ret < 0)
4808 return ret;
4809
4810 changes = old_flags ^ dev->flags;
7c355f53
TG
4811 if (changes)
4812 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4813
bd380811 4814 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4815 return ret;
4816}
d1b19dff 4817EXPORT_SYMBOL(dev_change_flags);
1da177e4 4818
f0db275a
SH
4819/**
4820 * dev_set_mtu - Change maximum transfer unit
4821 * @dev: device
4822 * @new_mtu: new transfer unit
4823 *
4824 * Change the maximum transfer size of the network device.
4825 */
1da177e4
LT
4826int dev_set_mtu(struct net_device *dev, int new_mtu)
4827{
d314774c 4828 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4829 int err;
4830
4831 if (new_mtu == dev->mtu)
4832 return 0;
4833
4834 /* MTU must be positive. */
4835 if (new_mtu < 0)
4836 return -EINVAL;
4837
4838 if (!netif_device_present(dev))
4839 return -ENODEV;
4840
4841 err = 0;
d314774c
SH
4842 if (ops->ndo_change_mtu)
4843 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4844 else
4845 dev->mtu = new_mtu;
d314774c 4846
1da177e4 4847 if (!err && dev->flags & IFF_UP)
056925ab 4848 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4849 return err;
4850}
d1b19dff 4851EXPORT_SYMBOL(dev_set_mtu);
1da177e4 4852
cbda10fa
VD
4853/**
4854 * dev_set_group - Change group this device belongs to
4855 * @dev: device
4856 * @new_group: group this device should belong to
4857 */
4858void dev_set_group(struct net_device *dev, int new_group)
4859{
4860 dev->group = new_group;
4861}
4862EXPORT_SYMBOL(dev_set_group);
4863
f0db275a
SH
4864/**
4865 * dev_set_mac_address - Change Media Access Control Address
4866 * @dev: device
4867 * @sa: new address
4868 *
4869 * Change the hardware (MAC) address of the device
4870 */
1da177e4
LT
4871int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4872{
d314774c 4873 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4874 int err;
4875
d314774c 4876 if (!ops->ndo_set_mac_address)
1da177e4
LT
4877 return -EOPNOTSUPP;
4878 if (sa->sa_family != dev->type)
4879 return -EINVAL;
4880 if (!netif_device_present(dev))
4881 return -ENODEV;
d314774c 4882 err = ops->ndo_set_mac_address(dev, sa);
1da177e4 4883 if (!err)
056925ab 4884 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
4885 return err;
4886}
d1b19dff 4887EXPORT_SYMBOL(dev_set_mac_address);
1da177e4
LT
4888
4889/*
3710becf 4890 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
1da177e4 4891 */
14e3e079 4892static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
4893{
4894 int err;
3710becf 4895 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
1da177e4
LT
4896
4897 if (!dev)
4898 return -ENODEV;
4899
4900 switch (cmd) {
d1b19dff
ED
4901 case SIOCGIFFLAGS: /* Get interface flags */
4902 ifr->ifr_flags = (short) dev_get_flags(dev);
4903 return 0;
1da177e4 4904
d1b19dff
ED
4905 case SIOCGIFMETRIC: /* Get the metric on the interface
4906 (currently unused) */
4907 ifr->ifr_metric = 0;
4908 return 0;
1da177e4 4909
d1b19dff
ED
4910 case SIOCGIFMTU: /* Get the MTU of a device */
4911 ifr->ifr_mtu = dev->mtu;
4912 return 0;
1da177e4 4913
d1b19dff
ED
4914 case SIOCGIFHWADDR:
4915 if (!dev->addr_len)
4916 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4917 else
4918 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4919 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4920 ifr->ifr_hwaddr.sa_family = dev->type;
4921 return 0;
1da177e4 4922
d1b19dff
ED
4923 case SIOCGIFSLAVE:
4924 err = -EINVAL;
4925 break;
14e3e079 4926
d1b19dff
ED
4927 case SIOCGIFMAP:
4928 ifr->ifr_map.mem_start = dev->mem_start;
4929 ifr->ifr_map.mem_end = dev->mem_end;
4930 ifr->ifr_map.base_addr = dev->base_addr;
4931 ifr->ifr_map.irq = dev->irq;
4932 ifr->ifr_map.dma = dev->dma;
4933 ifr->ifr_map.port = dev->if_port;
4934 return 0;
14e3e079 4935
d1b19dff
ED
4936 case SIOCGIFINDEX:
4937 ifr->ifr_ifindex = dev->ifindex;
4938 return 0;
14e3e079 4939
d1b19dff
ED
4940 case SIOCGIFTXQLEN:
4941 ifr->ifr_qlen = dev->tx_queue_len;
4942 return 0;
14e3e079 4943
d1b19dff
ED
4944 default:
4945 /* dev_ioctl() should ensure this case
4946 * is never reached
4947 */
4948 WARN_ON(1);
41c31f31 4949 err = -ENOTTY;
d1b19dff 4950 break;
14e3e079
JG
4951
4952 }
4953 return err;
4954}
4955
4956/*
4957 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4958 */
4959static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4960{
4961 int err;
4962 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5f2f6da7 4963 const struct net_device_ops *ops;
14e3e079
JG
4964
4965 if (!dev)
4966 return -ENODEV;
4967
5f2f6da7
JP
4968 ops = dev->netdev_ops;
4969
14e3e079 4970 switch (cmd) {
d1b19dff
ED
4971 case SIOCSIFFLAGS: /* Set interface flags */
4972 return dev_change_flags(dev, ifr->ifr_flags);
14e3e079 4973
d1b19dff
ED
4974 case SIOCSIFMETRIC: /* Set the metric on the interface
4975 (currently unused) */
4976 return -EOPNOTSUPP;
14e3e079 4977
d1b19dff
ED
4978 case SIOCSIFMTU: /* Set the MTU of a device */
4979 return dev_set_mtu(dev, ifr->ifr_mtu);
1da177e4 4980
d1b19dff
ED
4981 case SIOCSIFHWADDR:
4982 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
1da177e4 4983
d1b19dff
ED
4984 case SIOCSIFHWBROADCAST:
4985 if (ifr->ifr_hwaddr.sa_family != dev->type)
4986 return -EINVAL;
4987 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4988 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4989 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4990 return 0;
1da177e4 4991
d1b19dff
ED
4992 case SIOCSIFMAP:
4993 if (ops->ndo_set_config) {
1da177e4
LT
4994 if (!netif_device_present(dev))
4995 return -ENODEV;
d1b19dff
ED
4996 return ops->ndo_set_config(dev, &ifr->ifr_map);
4997 }
4998 return -EOPNOTSUPP;
1da177e4 4999
d1b19dff 5000 case SIOCADDMULTI:
b81693d9 5001 if (!ops->ndo_set_rx_mode ||
d1b19dff
ED
5002 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5003 return -EINVAL;
5004 if (!netif_device_present(dev))
5005 return -ENODEV;
22bedad3 5006 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
d1b19dff
ED
5007
5008 case SIOCDELMULTI:
b81693d9 5009 if (!ops->ndo_set_rx_mode ||
d1b19dff
ED
5010 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5011 return -EINVAL;
5012 if (!netif_device_present(dev))
5013 return -ENODEV;
22bedad3 5014 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
1da177e4 5015
d1b19dff
ED
5016 case SIOCSIFTXQLEN:
5017 if (ifr->ifr_qlen < 0)
5018 return -EINVAL;
5019 dev->tx_queue_len = ifr->ifr_qlen;
5020 return 0;
1da177e4 5021
d1b19dff
ED
5022 case SIOCSIFNAME:
5023 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5024 return dev_change_name(dev, ifr->ifr_newname);
1da177e4 5025
4dc360c5
RC
5026 case SIOCSHWTSTAMP:
5027 err = net_hwtstamp_validate(ifr);
5028 if (err)
5029 return err;
5030 /* fall through */
5031
d1b19dff
ED
5032 /*
5033 * Unknown or private ioctl
5034 */
5035 default:
5036 if ((cmd >= SIOCDEVPRIVATE &&
5037 cmd <= SIOCDEVPRIVATE + 15) ||
5038 cmd == SIOCBONDENSLAVE ||
5039 cmd == SIOCBONDRELEASE ||
5040 cmd == SIOCBONDSETHWADDR ||
5041 cmd == SIOCBONDSLAVEINFOQUERY ||
5042 cmd == SIOCBONDINFOQUERY ||
5043 cmd == SIOCBONDCHANGEACTIVE ||
5044 cmd == SIOCGMIIPHY ||
5045 cmd == SIOCGMIIREG ||
5046 cmd == SIOCSMIIREG ||
5047 cmd == SIOCBRADDIF ||
5048 cmd == SIOCBRDELIF ||
5049 cmd == SIOCSHWTSTAMP ||
5050 cmd == SIOCWANDEV) {
5051 err = -EOPNOTSUPP;
5052 if (ops->ndo_do_ioctl) {
5053 if (netif_device_present(dev))
5054 err = ops->ndo_do_ioctl(dev, ifr, cmd);
5055 else
5056 err = -ENODEV;
5057 }
5058 } else
5059 err = -EINVAL;
1da177e4
LT
5060
5061 }
5062 return err;
5063}
5064
5065/*
5066 * This function handles all "interface"-type I/O control requests. The actual
5067 * 'doing' part of this is dev_ifsioc above.
5068 */
5069
5070/**
5071 * dev_ioctl - network device ioctl
c4ea43c5 5072 * @net: the applicable net namespace
1da177e4
LT
5073 * @cmd: command to issue
5074 * @arg: pointer to a struct ifreq in user space
5075 *
5076 * Issue ioctl functions to devices. This is normally called by the
5077 * user space syscall interfaces but can sometimes be useful for
5078 * other purposes. The return value is the return from the syscall if
5079 * positive or a negative errno code on error.
5080 */
5081
881d966b 5082int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
5083{
5084 struct ifreq ifr;
5085 int ret;
5086 char *colon;
5087
5088 /* One special case: SIOCGIFCONF takes ifconf argument
5089 and requires shared lock, because it sleeps writing
5090 to user space.
5091 */
5092
5093 if (cmd == SIOCGIFCONF) {
6756ae4b 5094 rtnl_lock();
881d966b 5095 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 5096 rtnl_unlock();
1da177e4
LT
5097 return ret;
5098 }
5099 if (cmd == SIOCGIFNAME)
881d966b 5100 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
5101
5102 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5103 return -EFAULT;
5104
5105 ifr.ifr_name[IFNAMSIZ-1] = 0;
5106
5107 colon = strchr(ifr.ifr_name, ':');
5108 if (colon)
5109 *colon = 0;
5110
5111 /*
5112 * See which interface the caller is talking about.
5113 */
5114
5115 switch (cmd) {
d1b19dff
ED
5116 /*
5117 * These ioctl calls:
5118 * - can be done by all.
5119 * - atomic and do not require locking.
5120 * - return a value
5121 */
5122 case SIOCGIFFLAGS:
5123 case SIOCGIFMETRIC:
5124 case SIOCGIFMTU:
5125 case SIOCGIFHWADDR:
5126 case SIOCGIFSLAVE:
5127 case SIOCGIFMAP:
5128 case SIOCGIFINDEX:
5129 case SIOCGIFTXQLEN:
5130 dev_load(net, ifr.ifr_name);
3710becf 5131 rcu_read_lock();
d1b19dff 5132 ret = dev_ifsioc_locked(net, &ifr, cmd);
3710becf 5133 rcu_read_unlock();
d1b19dff
ED
5134 if (!ret) {
5135 if (colon)
5136 *colon = ':';
5137 if (copy_to_user(arg, &ifr,
5138 sizeof(struct ifreq)))
5139 ret = -EFAULT;
5140 }
5141 return ret;
1da177e4 5142
d1b19dff
ED
5143 case SIOCETHTOOL:
5144 dev_load(net, ifr.ifr_name);
5145 rtnl_lock();
5146 ret = dev_ethtool(net, &ifr);
5147 rtnl_unlock();
5148 if (!ret) {
5149 if (colon)
5150 *colon = ':';
5151 if (copy_to_user(arg, &ifr,
5152 sizeof(struct ifreq)))
5153 ret = -EFAULT;
5154 }
5155 return ret;
1da177e4 5156
d1b19dff
ED
5157 /*
5158 * These ioctl calls:
5159 * - require superuser power.
5160 * - require strict serialization.
5161 * - return a value
5162 */
5163 case SIOCGMIIPHY:
5164 case SIOCGMIIREG:
5165 case SIOCSIFNAME:
5166 if (!capable(CAP_NET_ADMIN))
5167 return -EPERM;
5168 dev_load(net, ifr.ifr_name);
5169 rtnl_lock();
5170 ret = dev_ifsioc(net, &ifr, cmd);
5171 rtnl_unlock();
5172 if (!ret) {
5173 if (colon)
5174 *colon = ':';
5175 if (copy_to_user(arg, &ifr,
5176 sizeof(struct ifreq)))
5177 ret = -EFAULT;
5178 }
5179 return ret;
1da177e4 5180
d1b19dff
ED
5181 /*
5182 * These ioctl calls:
5183 * - require superuser power.
5184 * - require strict serialization.
5185 * - do not return a value
5186 */
5187 case SIOCSIFFLAGS:
5188 case SIOCSIFMETRIC:
5189 case SIOCSIFMTU:
5190 case SIOCSIFMAP:
5191 case SIOCSIFHWADDR:
5192 case SIOCSIFSLAVE:
5193 case SIOCADDMULTI:
5194 case SIOCDELMULTI:
5195 case SIOCSIFHWBROADCAST:
5196 case SIOCSIFTXQLEN:
5197 case SIOCSMIIREG:
5198 case SIOCBONDENSLAVE:
5199 case SIOCBONDRELEASE:
5200 case SIOCBONDSETHWADDR:
5201 case SIOCBONDCHANGEACTIVE:
5202 case SIOCBRADDIF:
5203 case SIOCBRDELIF:
5204 case SIOCSHWTSTAMP:
5205 if (!capable(CAP_NET_ADMIN))
5206 return -EPERM;
5207 /* fall through */
5208 case SIOCBONDSLAVEINFOQUERY:
5209 case SIOCBONDINFOQUERY:
5210 dev_load(net, ifr.ifr_name);
5211 rtnl_lock();
5212 ret = dev_ifsioc(net, &ifr, cmd);
5213 rtnl_unlock();
5214 return ret;
5215
5216 case SIOCGIFMEM:
5217 /* Get the per device memory space. We can add this but
5218 * currently do not support it */
5219 case SIOCSIFMEM:
5220 /* Set the per device memory buffer space.
5221 * Not applicable in our case */
5222 case SIOCSIFLINK:
41c31f31 5223 return -ENOTTY;
d1b19dff
ED
5224
5225 /*
5226 * Unknown or private ioctl.
5227 */
5228 default:
5229 if (cmd == SIOCWANDEV ||
5230 (cmd >= SIOCDEVPRIVATE &&
5231 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 5232 dev_load(net, ifr.ifr_name);
1da177e4 5233 rtnl_lock();
881d966b 5234 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4 5235 rtnl_unlock();
d1b19dff
ED
5236 if (!ret && copy_to_user(arg, &ifr,
5237 sizeof(struct ifreq)))
5238 ret = -EFAULT;
1da177e4 5239 return ret;
d1b19dff
ED
5240 }
5241 /* Take care of Wireless Extensions */
5242 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5243 return wext_handle_ioctl(net, &ifr, cmd, arg);
41c31f31 5244 return -ENOTTY;
1da177e4
LT
5245 }
5246}
5247
5248
5249/**
5250 * dev_new_index - allocate an ifindex
c4ea43c5 5251 * @net: the applicable net namespace
1da177e4
LT
5252 *
5253 * Returns a suitable unique value for a new device interface
5254 * number. The caller must hold the rtnl semaphore or the
5255 * dev_base_lock to be sure it remains unique.
5256 */
881d966b 5257static int dev_new_index(struct net *net)
1da177e4
LT
5258{
5259 static int ifindex;
5260 for (;;) {
5261 if (++ifindex <= 0)
5262 ifindex = 1;
881d966b 5263 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
5264 return ifindex;
5265 }
5266}
5267
1da177e4 5268/* Delayed registration/unregisteration */
3b5b34fd 5269static LIST_HEAD(net_todo_list);
1da177e4 5270
6f05f629 5271static void net_set_todo(struct net_device *dev)
1da177e4 5272{
1da177e4 5273 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
5274}
5275
9b5e383c 5276static void rollback_registered_many(struct list_head *head)
93ee31f1 5277{
e93737b0 5278 struct net_device *dev, *tmp;
9b5e383c 5279
93ee31f1
DL
5280 BUG_ON(dev_boot_phase);
5281 ASSERT_RTNL();
5282
e93737b0 5283 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5284 /* Some devices call without registering
e93737b0
KK
5285 * for initialization unwind. Remove those
5286 * devices and proceed with the remaining.
9b5e383c
ED
5287 */
5288 if (dev->reg_state == NETREG_UNINITIALIZED) {
5289 pr_debug("unregister_netdevice: device %s/%p never "
5290 "was registered\n", dev->name, dev);
93ee31f1 5291
9b5e383c 5292 WARN_ON(1);
e93737b0
KK
5293 list_del(&dev->unreg_list);
5294 continue;
9b5e383c 5295 }
449f4544 5296 dev->dismantle = true;
9b5e383c 5297 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5298 }
93ee31f1 5299
44345724
OP
5300 /* If device is running, close it first. */
5301 dev_close_many(head);
93ee31f1 5302
44345724 5303 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5304 /* And unlink it from device chain. */
5305 unlist_netdevice(dev);
93ee31f1 5306
9b5e383c
ED
5307 dev->reg_state = NETREG_UNREGISTERING;
5308 }
93ee31f1
DL
5309
5310 synchronize_net();
5311
9b5e383c
ED
5312 list_for_each_entry(dev, head, unreg_list) {
5313 /* Shutdown queueing discipline. */
5314 dev_shutdown(dev);
93ee31f1
DL
5315
5316
9b5e383c
ED
5317 /* Notify protocols, that we are about to destroy
5318 this device. They should clean all the things.
5319 */
5320 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5321
a2835763
PM
5322 if (!dev->rtnl_link_ops ||
5323 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5324 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5325
9b5e383c
ED
5326 /*
5327 * Flush the unicast and multicast chains
5328 */
a748ee24 5329 dev_uc_flush(dev);
22bedad3 5330 dev_mc_flush(dev);
93ee31f1 5331
9b5e383c
ED
5332 if (dev->netdev_ops->ndo_uninit)
5333 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5334
9b5e383c
ED
5335 /* Notifier chain MUST detach us from master device. */
5336 WARN_ON(dev->master);
93ee31f1 5337
9b5e383c
ED
5338 /* Remove entries from kobject tree */
5339 netdev_unregister_kobject(dev);
5340 }
93ee31f1 5341
a5ee1551 5342 /* Process any work delayed until the end of the batch */
e5e26d75 5343 dev = list_first_entry(head, struct net_device, unreg_list);
a5ee1551 5344 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
93ee31f1 5345
850a545b 5346 synchronize_net();
395264d5 5347
a5ee1551 5348 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5349 dev_put(dev);
5350}
5351
5352static void rollback_registered(struct net_device *dev)
5353{
5354 LIST_HEAD(single);
5355
5356 list_add(&dev->unreg_list, &single);
5357 rollback_registered_many(&single);
ceaaec98 5358 list_del(&single);
93ee31f1
DL
5359}
5360
fec30c33 5361static u32 netdev_fix_features(struct net_device *dev, u32 features)
b63365a2 5362{
57422dc5
MM
5363 /* Fix illegal checksum combinations */
5364 if ((features & NETIF_F_HW_CSUM) &&
5365 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5366 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5367 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5368 }
5369
5370 if ((features & NETIF_F_NO_CSUM) &&
5371 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5372 netdev_warn(dev, "mixed no checksumming and other settings.\n");
57422dc5
MM
5373 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5374 }
5375
b63365a2
HX
5376 /* Fix illegal SG+CSUM combinations. */
5377 if ((features & NETIF_F_SG) &&
5378 !(features & NETIF_F_ALL_CSUM)) {
6f404e44
MM
5379 netdev_dbg(dev,
5380 "Dropping NETIF_F_SG since no checksum feature.\n");
b63365a2
HX
5381 features &= ~NETIF_F_SG;
5382 }
5383
5384 /* TSO requires that SG is present as well. */
ea2d3688 5385 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5386 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5387 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5388 }
5389
31d8b9e0
BH
5390 /* TSO ECN requires that TSO is present as well. */
5391 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5392 features &= ~NETIF_F_TSO_ECN;
5393
212b573f
MM
5394 /* Software GSO depends on SG. */
5395 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5396 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5397 features &= ~NETIF_F_GSO;
5398 }
5399
acd1130e 5400 /* UFO needs SG and checksumming */
b63365a2 5401 if (features & NETIF_F_UFO) {
79032644
MM
5402 /* maybe split UFO into V4 and V6? */
5403 if (!((features & NETIF_F_GEN_CSUM) ||
5404 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5405 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5406 netdev_dbg(dev,
acd1130e 5407 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5408 features &= ~NETIF_F_UFO;
5409 }
5410
5411 if (!(features & NETIF_F_SG)) {
6f404e44 5412 netdev_dbg(dev,
acd1130e 5413 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5414 features &= ~NETIF_F_UFO;
5415 }
5416 }
5417
5418 return features;
5419}
b63365a2 5420
6cb6a27c 5421int __netdev_update_features(struct net_device *dev)
5455c699
MM
5422{
5423 u32 features;
5424 int err = 0;
5425
87267485
MM
5426 ASSERT_RTNL();
5427
5455c699
MM
5428 features = netdev_get_wanted_features(dev);
5429
5430 if (dev->netdev_ops->ndo_fix_features)
5431 features = dev->netdev_ops->ndo_fix_features(dev, features);
5432
5433 /* driver might be less strict about feature dependencies */
5434 features = netdev_fix_features(dev, features);
5435
5436 if (dev->features == features)
6cb6a27c 5437 return 0;
5455c699 5438
604ae14f 5439 netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5455c699
MM
5440 dev->features, features);
5441
5442 if (dev->netdev_ops->ndo_set_features)
5443 err = dev->netdev_ops->ndo_set_features(dev, features);
5444
6cb6a27c 5445 if (unlikely(err < 0)) {
5455c699
MM
5446 netdev_err(dev,
5447 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5448 err, features, dev->features);
6cb6a27c
MM
5449 return -1;
5450 }
5451
5452 if (!err)
5453 dev->features = features;
5454
5455 return 1;
5456}
5457
afe12cc8
MM
5458/**
5459 * netdev_update_features - recalculate device features
5460 * @dev: the device to check
5461 *
5462 * Recalculate dev->features set and send notifications if it
5463 * has changed. Should be called after driver or hardware dependent
5464 * conditions might have changed that influence the features.
5465 */
6cb6a27c
MM
5466void netdev_update_features(struct net_device *dev)
5467{
5468 if (__netdev_update_features(dev))
5469 netdev_features_change(dev);
5455c699
MM
5470}
5471EXPORT_SYMBOL(netdev_update_features);
5472
afe12cc8
MM
5473/**
5474 * netdev_change_features - recalculate device features
5475 * @dev: the device to check
5476 *
5477 * Recalculate dev->features set and send notifications even
5478 * if they have not changed. Should be called instead of
5479 * netdev_update_features() if also dev->vlan_features might
5480 * have changed to allow the changes to be propagated to stacked
5481 * VLAN devices.
5482 */
5483void netdev_change_features(struct net_device *dev)
5484{
5485 __netdev_update_features(dev);
5486 netdev_features_change(dev);
5487}
5488EXPORT_SYMBOL(netdev_change_features);
5489
fc4a7489
PM
5490/**
5491 * netif_stacked_transfer_operstate - transfer operstate
5492 * @rootdev: the root or lower level device to transfer state from
5493 * @dev: the device to transfer operstate to
5494 *
5495 * Transfer operational state from root to device. This is normally
5496 * called when a stacking relationship exists between the root
5497 * device and the device(a leaf device).
5498 */
5499void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5500 struct net_device *dev)
5501{
5502 if (rootdev->operstate == IF_OPER_DORMANT)
5503 netif_dormant_on(dev);
5504 else
5505 netif_dormant_off(dev);
5506
5507 if (netif_carrier_ok(rootdev)) {
5508 if (!netif_carrier_ok(dev))
5509 netif_carrier_on(dev);
5510 } else {
5511 if (netif_carrier_ok(dev))
5512 netif_carrier_off(dev);
5513 }
5514}
5515EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5516
bf264145 5517#ifdef CONFIG_RPS
1b4bf461
ED
5518static int netif_alloc_rx_queues(struct net_device *dev)
5519{
1b4bf461 5520 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5521 struct netdev_rx_queue *rx;
1b4bf461 5522
bd25fa7b 5523 BUG_ON(count < 1);
1b4bf461 5524
bd25fa7b
TH
5525 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5526 if (!rx) {
5527 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5528 return -ENOMEM;
1b4bf461 5529 }
bd25fa7b
TH
5530 dev->_rx = rx;
5531
bd25fa7b 5532 for (i = 0; i < count; i++)
fe822240 5533 rx[i].dev = dev;
1b4bf461
ED
5534 return 0;
5535}
bf264145 5536#endif
1b4bf461 5537
aa942104
CG
5538static void netdev_init_one_queue(struct net_device *dev,
5539 struct netdev_queue *queue, void *_unused)
5540{
5541 /* Initialize queue lock */
5542 spin_lock_init(&queue->_xmit_lock);
5543 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5544 queue->xmit_lock_owner = -1;
b236da69 5545 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104
CG
5546 queue->dev = dev;
5547}
5548
e6484930
TH
5549static int netif_alloc_netdev_queues(struct net_device *dev)
5550{
5551 unsigned int count = dev->num_tx_queues;
5552 struct netdev_queue *tx;
5553
5554 BUG_ON(count < 1);
5555
5556 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5557 if (!tx) {
5558 pr_err("netdev: Unable to allocate %u tx queues.\n",
5559 count);
5560 return -ENOMEM;
5561 }
5562 dev->_tx = tx;
1d24eb48 5563
e6484930
TH
5564 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5565 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5566
5567 return 0;
e6484930
TH
5568}
5569
1da177e4
LT
5570/**
5571 * register_netdevice - register a network device
5572 * @dev: device to register
5573 *
5574 * Take a completed network device structure and add it to the kernel
5575 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5576 * chain. 0 is returned on success. A negative errno code is returned
5577 * on a failure to set up the device, or if the name is a duplicate.
5578 *
5579 * Callers must hold the rtnl semaphore. You may want
5580 * register_netdev() instead of this.
5581 *
5582 * BUGS:
5583 * The locking appears insufficient to guarantee two parallel registers
5584 * will not get the same name.
5585 */
5586
5587int register_netdevice(struct net_device *dev)
5588{
1da177e4 5589 int ret;
d314774c 5590 struct net *net = dev_net(dev);
1da177e4
LT
5591
5592 BUG_ON(dev_boot_phase);
5593 ASSERT_RTNL();
5594
b17a7c17
SH
5595 might_sleep();
5596
1da177e4
LT
5597 /* When net_device's are persistent, this will be fatal. */
5598 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5599 BUG_ON(!net);
1da177e4 5600
f1f28aa3 5601 spin_lock_init(&dev->addr_list_lock);
cf508b12 5602 netdev_set_addr_lockdep_class(dev);
1da177e4 5603
1da177e4
LT
5604 dev->iflink = -1;
5605
0696c3a8
PP
5606 ret = dev_get_valid_name(dev, dev->name);
5607 if (ret < 0)
5608 goto out;
5609
1da177e4 5610 /* Init, if this function is available */
d314774c
SH
5611 if (dev->netdev_ops->ndo_init) {
5612 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5613 if (ret) {
5614 if (ret > 0)
5615 ret = -EIO;
90833aa4 5616 goto out;
1da177e4
LT
5617 }
5618 }
4ec93edb 5619
881d966b 5620 dev->ifindex = dev_new_index(net);
1da177e4
LT
5621 if (dev->iflink == -1)
5622 dev->iflink = dev->ifindex;
5623
5455c699
MM
5624 /* Transfer changeable features to wanted_features and enable
5625 * software offloads (GSO and GRO).
5626 */
5627 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
5628 dev->features |= NETIF_F_SOFT_FEATURES;
5629 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 5630
c6e1a0d1
TH
5631 /* Turn on no cache copy if HW is doing checksum */
5632 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5633 if ((dev->features & NETIF_F_ALL_CSUM) &&
5634 !(dev->features & NETIF_F_NO_CSUM)) {
5635 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5636 dev->features |= NETIF_F_NOCACHE_COPY;
5637 }
5638
1180e7d6 5639 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 5640 */
1180e7d6 5641 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 5642
7ffbe3fd
JB
5643 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5644 ret = notifier_to_errno(ret);
5645 if (ret)
5646 goto err_uninit;
5647
8b41d188 5648 ret = netdev_register_kobject(dev);
b17a7c17 5649 if (ret)
7ce1b0ed 5650 goto err_uninit;
b17a7c17
SH
5651 dev->reg_state = NETREG_REGISTERED;
5652
6cb6a27c 5653 __netdev_update_features(dev);
8e9b59b2 5654
1da177e4
LT
5655 /*
5656 * Default initial state at registry is that the
5657 * device is present.
5658 */
5659
5660 set_bit(__LINK_STATE_PRESENT, &dev->state);
5661
1da177e4 5662 dev_init_scheduler(dev);
1da177e4 5663 dev_hold(dev);
ce286d32 5664 list_netdevice(dev);
1da177e4
LT
5665
5666 /* Notify protocols, that a new device appeared. */
056925ab 5667 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5668 ret = notifier_to_errno(ret);
93ee31f1
DL
5669 if (ret) {
5670 rollback_registered(dev);
5671 dev->reg_state = NETREG_UNREGISTERED;
5672 }
d90a909e
EB
5673 /*
5674 * Prevent userspace races by waiting until the network
5675 * device is fully setup before sending notifications.
5676 */
a2835763
PM
5677 if (!dev->rtnl_link_ops ||
5678 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5679 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5680
5681out:
5682 return ret;
7ce1b0ed
HX
5683
5684err_uninit:
d314774c
SH
5685 if (dev->netdev_ops->ndo_uninit)
5686 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5687 goto out;
1da177e4 5688}
d1b19dff 5689EXPORT_SYMBOL(register_netdevice);
1da177e4 5690
937f1ba5
BH
5691/**
5692 * init_dummy_netdev - init a dummy network device for NAPI
5693 * @dev: device to init
5694 *
5695 * This takes a network device structure and initialize the minimum
5696 * amount of fields so it can be used to schedule NAPI polls without
5697 * registering a full blown interface. This is to be used by drivers
5698 * that need to tie several hardware interfaces to a single NAPI
5699 * poll scheduler due to HW limitations.
5700 */
5701int init_dummy_netdev(struct net_device *dev)
5702{
5703 /* Clear everything. Note we don't initialize spinlocks
5704 * are they aren't supposed to be taken by any of the
5705 * NAPI code and this dummy netdev is supposed to be
5706 * only ever used for NAPI polls
5707 */
5708 memset(dev, 0, sizeof(struct net_device));
5709
5710 /* make sure we BUG if trying to hit standard
5711 * register/unregister code path
5712 */
5713 dev->reg_state = NETREG_DUMMY;
5714
937f1ba5
BH
5715 /* NAPI wants this */
5716 INIT_LIST_HEAD(&dev->napi_list);
5717
5718 /* a dummy interface is started by default */
5719 set_bit(__LINK_STATE_PRESENT, &dev->state);
5720 set_bit(__LINK_STATE_START, &dev->state);
5721
29b4433d
ED
5722 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5723 * because users of this 'device' dont need to change
5724 * its refcount.
5725 */
5726
937f1ba5
BH
5727 return 0;
5728}
5729EXPORT_SYMBOL_GPL(init_dummy_netdev);
5730
5731
1da177e4
LT
5732/**
5733 * register_netdev - register a network device
5734 * @dev: device to register
5735 *
5736 * Take a completed network device structure and add it to the kernel
5737 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5738 * chain. 0 is returned on success. A negative errno code is returned
5739 * on a failure to set up the device, or if the name is a duplicate.
5740 *
38b4da38 5741 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5742 * and expands the device name if you passed a format string to
5743 * alloc_netdev.
5744 */
5745int register_netdev(struct net_device *dev)
5746{
5747 int err;
5748
5749 rtnl_lock();
1da177e4 5750 err = register_netdevice(dev);
1da177e4
LT
5751 rtnl_unlock();
5752 return err;
5753}
5754EXPORT_SYMBOL(register_netdev);
5755
29b4433d
ED
5756int netdev_refcnt_read(const struct net_device *dev)
5757{
5758 int i, refcnt = 0;
5759
5760 for_each_possible_cpu(i)
5761 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5762 return refcnt;
5763}
5764EXPORT_SYMBOL(netdev_refcnt_read);
5765
1da177e4
LT
5766/*
5767 * netdev_wait_allrefs - wait until all references are gone.
5768 *
5769 * This is called when unregistering network devices.
5770 *
5771 * Any protocol or device that holds a reference should register
5772 * for netdevice notification, and cleanup and put back the
5773 * reference if they receive an UNREGISTER event.
5774 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5775 * call dev_put.
1da177e4
LT
5776 */
5777static void netdev_wait_allrefs(struct net_device *dev)
5778{
5779 unsigned long rebroadcast_time, warning_time;
29b4433d 5780 int refcnt;
1da177e4 5781
e014debe
ED
5782 linkwatch_forget_dev(dev);
5783
1da177e4 5784 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5785 refcnt = netdev_refcnt_read(dev);
5786
5787 while (refcnt != 0) {
1da177e4 5788 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5789 rtnl_lock();
1da177e4
LT
5790
5791 /* Rebroadcast unregister notification */
056925ab 5792 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5793 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
395264d5 5794 * should have already handle it the first time */
1da177e4
LT
5795
5796 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5797 &dev->state)) {
5798 /* We must not have linkwatch events
5799 * pending on unregister. If this
5800 * happens, we simply run the queue
5801 * unscheduled, resulting in a noop
5802 * for this device.
5803 */
5804 linkwatch_run_queue();
5805 }
5806
6756ae4b 5807 __rtnl_unlock();
1da177e4
LT
5808
5809 rebroadcast_time = jiffies;
5810 }
5811
5812 msleep(250);
5813
29b4433d
ED
5814 refcnt = netdev_refcnt_read(dev);
5815
1da177e4
LT
5816 if (time_after(jiffies, warning_time + 10 * HZ)) {
5817 printk(KERN_EMERG "unregister_netdevice: "
5818 "waiting for %s to become free. Usage "
5819 "count = %d\n",
29b4433d 5820 dev->name, refcnt);
1da177e4
LT
5821 warning_time = jiffies;
5822 }
5823 }
5824}
5825
5826/* The sequence is:
5827 *
5828 * rtnl_lock();
5829 * ...
5830 * register_netdevice(x1);
5831 * register_netdevice(x2);
5832 * ...
5833 * unregister_netdevice(y1);
5834 * unregister_netdevice(y2);
5835 * ...
5836 * rtnl_unlock();
5837 * free_netdev(y1);
5838 * free_netdev(y2);
5839 *
58ec3b4d 5840 * We are invoked by rtnl_unlock().
1da177e4 5841 * This allows us to deal with problems:
b17a7c17 5842 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5843 * without deadlocking with linkwatch via keventd.
5844 * 2) Since we run with the RTNL semaphore not held, we can sleep
5845 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5846 *
5847 * We must not return until all unregister events added during
5848 * the interval the lock was held have been completed.
1da177e4 5849 */
1da177e4
LT
5850void netdev_run_todo(void)
5851{
626ab0e6 5852 struct list_head list;
1da177e4 5853
1da177e4 5854 /* Snapshot list, allow later requests */
626ab0e6 5855 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5856
5857 __rtnl_unlock();
626ab0e6 5858
850a545b
EB
5859 /* Wait for rcu callbacks to finish before attempting to drain
5860 * the device list. This usually avoids a 250ms wait.
5861 */
5862 if (!list_empty(&list))
5863 rcu_barrier();
5864
1da177e4
LT
5865 while (!list_empty(&list)) {
5866 struct net_device *dev
e5e26d75 5867 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5868 list_del(&dev->todo_list);
5869
b17a7c17
SH
5870 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5871 printk(KERN_ERR "network todo '%s' but state %d\n",
5872 dev->name, dev->reg_state);
5873 dump_stack();
5874 continue;
5875 }
1da177e4 5876
b17a7c17 5877 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5878
152102c7 5879 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5880
b17a7c17 5881 netdev_wait_allrefs(dev);
1da177e4 5882
b17a7c17 5883 /* paranoia */
29b4433d 5884 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
5885 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5886 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 5887 WARN_ON(dev->dn_ptr);
1da177e4 5888
b17a7c17
SH
5889 if (dev->destructor)
5890 dev->destructor(dev);
9093bbb2
SH
5891
5892 /* Free network device */
5893 kobject_put(&dev->dev.kobj);
1da177e4 5894 }
1da177e4
LT
5895}
5896
3cfde79c
BH
5897/* Convert net_device_stats to rtnl_link_stats64. They have the same
5898 * fields in the same order, with only the type differing.
5899 */
5900static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5901 const struct net_device_stats *netdev_stats)
5902{
5903#if BITS_PER_LONG == 64
5904 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5905 memcpy(stats64, netdev_stats, sizeof(*stats64));
5906#else
5907 size_t i, n = sizeof(*stats64) / sizeof(u64);
5908 const unsigned long *src = (const unsigned long *)netdev_stats;
5909 u64 *dst = (u64 *)stats64;
5910
5911 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5912 sizeof(*stats64) / sizeof(u64));
5913 for (i = 0; i < n; i++)
5914 dst[i] = src[i];
5915#endif
5916}
5917
eeda3fd6
SH
5918/**
5919 * dev_get_stats - get network device statistics
5920 * @dev: device to get statistics from
28172739 5921 * @storage: place to store stats
eeda3fd6 5922 *
d7753516
BH
5923 * Get network statistics from device. Return @storage.
5924 * The device driver may provide its own method by setting
5925 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5926 * otherwise the internal statistics structure is used.
eeda3fd6 5927 */
d7753516
BH
5928struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5929 struct rtnl_link_stats64 *storage)
7004bf25 5930{
eeda3fd6
SH
5931 const struct net_device_ops *ops = dev->netdev_ops;
5932
28172739
ED
5933 if (ops->ndo_get_stats64) {
5934 memset(storage, 0, sizeof(*storage));
caf586e5
ED
5935 ops->ndo_get_stats64(dev, storage);
5936 } else if (ops->ndo_get_stats) {
3cfde79c 5937 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
5938 } else {
5939 netdev_stats_to_stats64(storage, &dev->stats);
28172739 5940 }
caf586e5 5941 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5942 return storage;
c45d286e 5943}
eeda3fd6 5944EXPORT_SYMBOL(dev_get_stats);
c45d286e 5945
24824a09 5946struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5947{
24824a09 5948 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5949
24824a09
ED
5950#ifdef CONFIG_NET_CLS_ACT
5951 if (queue)
5952 return queue;
5953 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5954 if (!queue)
5955 return NULL;
5956 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
5957 queue->qdisc = &noop_qdisc;
5958 queue->qdisc_sleeping = &noop_qdisc;
5959 rcu_assign_pointer(dev->ingress_queue, queue);
5960#endif
5961 return queue;
bb949fbd
DM
5962}
5963
1da177e4 5964/**
36909ea4 5965 * alloc_netdev_mqs - allocate network device
1da177e4
LT
5966 * @sizeof_priv: size of private data to allocate space for
5967 * @name: device name format string
5968 * @setup: callback to initialize device
36909ea4
TH
5969 * @txqs: the number of TX subqueues to allocate
5970 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
5971 *
5972 * Allocates a struct net_device with private data area for driver use
f25f4e44 5973 * and performs basic initialization. Also allocates subquue structs
36909ea4 5974 * for each queue on the device.
1da177e4 5975 */
36909ea4
TH
5976struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5977 void (*setup)(struct net_device *),
5978 unsigned int txqs, unsigned int rxqs)
1da177e4 5979{
1da177e4 5980 struct net_device *dev;
7943986c 5981 size_t alloc_size;
1ce8e7b5 5982 struct net_device *p;
1da177e4 5983
b6fe17d6
SH
5984 BUG_ON(strlen(name) >= sizeof(dev->name));
5985
36909ea4 5986 if (txqs < 1) {
55513fb4
TH
5987 pr_err("alloc_netdev: Unable to allocate device "
5988 "with zero queues.\n");
5989 return NULL;
5990 }
5991
36909ea4
TH
5992#ifdef CONFIG_RPS
5993 if (rxqs < 1) {
5994 pr_err("alloc_netdev: Unable to allocate device "
5995 "with zero RX queues.\n");
5996 return NULL;
5997 }
5998#endif
5999
fd2ea0a7 6000 alloc_size = sizeof(struct net_device);
d1643d24
AD
6001 if (sizeof_priv) {
6002 /* ensure 32-byte alignment of private area */
1ce8e7b5 6003 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
6004 alloc_size += sizeof_priv;
6005 }
6006 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 6007 alloc_size += NETDEV_ALIGN - 1;
1da177e4 6008
31380de9 6009 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 6010 if (!p) {
b6fe17d6 6011 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
1da177e4
LT
6012 return NULL;
6013 }
1da177e4 6014
1ce8e7b5 6015 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 6016 dev->padded = (char *)dev - (char *)p;
ab9c73cc 6017
29b4433d
ED
6018 dev->pcpu_refcnt = alloc_percpu(int);
6019 if (!dev->pcpu_refcnt)
e6484930 6020 goto free_p;
ab9c73cc 6021
ab9c73cc 6022 if (dev_addr_init(dev))
29b4433d 6023 goto free_pcpu;
ab9c73cc 6024
22bedad3 6025 dev_mc_init(dev);
a748ee24 6026 dev_uc_init(dev);
ccffad25 6027
c346dca1 6028 dev_net_set(dev, &init_net);
1da177e4 6029
8d3bdbd5
DM
6030 dev->gso_max_size = GSO_MAX_SIZE;
6031
8d3bdbd5
DM
6032 INIT_LIST_HEAD(&dev->napi_list);
6033 INIT_LIST_HEAD(&dev->unreg_list);
6034 INIT_LIST_HEAD(&dev->link_watch_list);
6035 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6036 setup(dev);
6037
36909ea4
TH
6038 dev->num_tx_queues = txqs;
6039 dev->real_num_tx_queues = txqs;
ed9af2e8 6040 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6041 goto free_all;
e8a0464c 6042
df334545 6043#ifdef CONFIG_RPS
36909ea4
TH
6044 dev->num_rx_queues = rxqs;
6045 dev->real_num_rx_queues = rxqs;
fe822240 6046 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6047 goto free_all;
df334545 6048#endif
0a9627f2 6049
1da177e4 6050 strcpy(dev->name, name);
cbda10fa 6051 dev->group = INIT_NETDEV_GROUP;
1da177e4 6052 return dev;
ab9c73cc 6053
8d3bdbd5
DM
6054free_all:
6055 free_netdev(dev);
6056 return NULL;
6057
29b4433d
ED
6058free_pcpu:
6059 free_percpu(dev->pcpu_refcnt);
ed9af2e8 6060 kfree(dev->_tx);
fe822240
TH
6061#ifdef CONFIG_RPS
6062 kfree(dev->_rx);
6063#endif
6064
ab9c73cc
JP
6065free_p:
6066 kfree(p);
6067 return NULL;
1da177e4 6068}
36909ea4 6069EXPORT_SYMBOL(alloc_netdev_mqs);
1da177e4
LT
6070
6071/**
6072 * free_netdev - free network device
6073 * @dev: device
6074 *
4ec93edb
YH
6075 * This function does the last stage of destroying an allocated device
6076 * interface. The reference to the device object is released.
1da177e4
LT
6077 * If this is the last reference then it will be freed.
6078 */
6079void free_netdev(struct net_device *dev)
6080{
d565b0a1
HX
6081 struct napi_struct *p, *n;
6082
f3005d7f
DL
6083 release_net(dev_net(dev));
6084
e8a0464c 6085 kfree(dev->_tx);
fe822240
TH
6086#ifdef CONFIG_RPS
6087 kfree(dev->_rx);
6088#endif
e8a0464c 6089
33d480ce 6090 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6091
f001fde5
JP
6092 /* Flush device addresses */
6093 dev_addr_flush(dev);
6094
d565b0a1
HX
6095 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6096 netif_napi_del(p);
6097
29b4433d
ED
6098 free_percpu(dev->pcpu_refcnt);
6099 dev->pcpu_refcnt = NULL;
6100
3041a069 6101 /* Compatibility with error handling in drivers */
1da177e4
LT
6102 if (dev->reg_state == NETREG_UNINITIALIZED) {
6103 kfree((char *)dev - dev->padded);
6104 return;
6105 }
6106
6107 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6108 dev->reg_state = NETREG_RELEASED;
6109
43cb76d9
GKH
6110 /* will free via device release */
6111 put_device(&dev->dev);
1da177e4 6112}
d1b19dff 6113EXPORT_SYMBOL(free_netdev);
4ec93edb 6114
f0db275a
SH
6115/**
6116 * synchronize_net - Synchronize with packet receive processing
6117 *
6118 * Wait for packets currently being received to be done.
6119 * Does not block later packets from starting.
6120 */
4ec93edb 6121void synchronize_net(void)
1da177e4
LT
6122{
6123 might_sleep();
be3fc413
ED
6124 if (rtnl_is_locked())
6125 synchronize_rcu_expedited();
6126 else
6127 synchronize_rcu();
1da177e4 6128}
d1b19dff 6129EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6130
6131/**
44a0873d 6132 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6133 * @dev: device
44a0873d 6134 * @head: list
6ebfbc06 6135 *
1da177e4 6136 * This function shuts down a device interface and removes it
d59b54b1 6137 * from the kernel tables.
44a0873d 6138 * If head not NULL, device is queued to be unregistered later.
1da177e4
LT
6139 *
6140 * Callers must hold the rtnl semaphore. You may want
6141 * unregister_netdev() instead of this.
6142 */
6143
44a0873d 6144void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6145{
a6620712
HX
6146 ASSERT_RTNL();
6147
44a0873d 6148 if (head) {
9fdce099 6149 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6150 } else {
6151 rollback_registered(dev);
6152 /* Finish processing unregister after unlock */
6153 net_set_todo(dev);
6154 }
1da177e4 6155}
44a0873d 6156EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6157
9b5e383c
ED
6158/**
6159 * unregister_netdevice_many - unregister many devices
6160 * @head: list of devices
9b5e383c
ED
6161 */
6162void unregister_netdevice_many(struct list_head *head)
6163{
6164 struct net_device *dev;
6165
6166 if (!list_empty(head)) {
6167 rollback_registered_many(head);
6168 list_for_each_entry(dev, head, unreg_list)
6169 net_set_todo(dev);
6170 }
6171}
63c8099d 6172EXPORT_SYMBOL(unregister_netdevice_many);
9b5e383c 6173
1da177e4
LT
6174/**
6175 * unregister_netdev - remove device from the kernel
6176 * @dev: device
6177 *
6178 * This function shuts down a device interface and removes it
d59b54b1 6179 * from the kernel tables.
1da177e4
LT
6180 *
6181 * This is just a wrapper for unregister_netdevice that takes
6182 * the rtnl semaphore. In general you want to use this and not
6183 * unregister_netdevice.
6184 */
6185void unregister_netdev(struct net_device *dev)
6186{
6187 rtnl_lock();
6188 unregister_netdevice(dev);
6189 rtnl_unlock();
6190}
1da177e4
LT
6191EXPORT_SYMBOL(unregister_netdev);
6192
ce286d32
EB
6193/**
6194 * dev_change_net_namespace - move device to different nethost namespace
6195 * @dev: device
6196 * @net: network namespace
6197 * @pat: If not NULL name pattern to try if the current device name
6198 * is already taken in the destination network namespace.
6199 *
6200 * This function shuts down a device interface and moves it
6201 * to a new network namespace. On success 0 is returned, on
6202 * a failure a netagive errno code is returned.
6203 *
6204 * Callers must hold the rtnl semaphore.
6205 */
6206
6207int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6208{
ce286d32
EB
6209 int err;
6210
6211 ASSERT_RTNL();
6212
6213 /* Don't allow namespace local devices to be moved. */
6214 err = -EINVAL;
6215 if (dev->features & NETIF_F_NETNS_LOCAL)
6216 goto out;
6217
6218 /* Ensure the device has been registrered */
6219 err = -EINVAL;
6220 if (dev->reg_state != NETREG_REGISTERED)
6221 goto out;
6222
6223 /* Get out if there is nothing todo */
6224 err = 0;
878628fb 6225 if (net_eq(dev_net(dev), net))
ce286d32
EB
6226 goto out;
6227
6228 /* Pick the destination device name, and ensure
6229 * we can use it in the destination network namespace.
6230 */
6231 err = -EEXIST;
d9031024 6232 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6233 /* We get here if we can't use the current device name */
6234 if (!pat)
6235 goto out;
1c5cae81 6236 if (dev_get_valid_name(dev, pat) < 0)
ce286d32
EB
6237 goto out;
6238 }
6239
6240 /*
6241 * And now a mini version of register_netdevice unregister_netdevice.
6242 */
6243
6244 /* If device is running close it first. */
9b772652 6245 dev_close(dev);
ce286d32
EB
6246
6247 /* And unlink it from device chain */
6248 err = -ENODEV;
6249 unlist_netdevice(dev);
6250
6251 synchronize_net();
6252
6253 /* Shutdown queueing discipline. */
6254 dev_shutdown(dev);
6255
6256 /* Notify protocols, that we are about to destroy
6257 this device. They should clean all the things.
3b27e105
DL
6258
6259 Note that dev->reg_state stays at NETREG_REGISTERED.
6260 This is wanted because this way 8021q and macvlan know
6261 the device is just moving and can keep their slaves up.
ce286d32
EB
6262 */
6263 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 6264 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
d2237d35 6265 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
ce286d32
EB
6266
6267 /*
6268 * Flush the unicast and multicast chains
6269 */
a748ee24 6270 dev_uc_flush(dev);
22bedad3 6271 dev_mc_flush(dev);
ce286d32
EB
6272
6273 /* Actually switch the network namespace */
c346dca1 6274 dev_net_set(dev, net);
ce286d32 6275
ce286d32
EB
6276 /* If there is an ifindex conflict assign a new one */
6277 if (__dev_get_by_index(net, dev->ifindex)) {
6278 int iflink = (dev->iflink == dev->ifindex);
6279 dev->ifindex = dev_new_index(net);
6280 if (iflink)
6281 dev->iflink = dev->ifindex;
6282 }
6283
8b41d188 6284 /* Fixup kobjects */
a1b3f594 6285 err = device_rename(&dev->dev, dev->name);
8b41d188 6286 WARN_ON(err);
ce286d32
EB
6287
6288 /* Add the device back in the hashes */
6289 list_netdevice(dev);
6290
6291 /* Notify protocols, that a new device appeared. */
6292 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6293
d90a909e
EB
6294 /*
6295 * Prevent userspace races by waiting until the network
6296 * device is fully setup before sending notifications.
6297 */
6298 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6299
ce286d32
EB
6300 synchronize_net();
6301 err = 0;
6302out:
6303 return err;
6304}
463d0183 6305EXPORT_SYMBOL_GPL(dev_change_net_namespace);
ce286d32 6306
1da177e4
LT
6307static int dev_cpu_callback(struct notifier_block *nfb,
6308 unsigned long action,
6309 void *ocpu)
6310{
6311 struct sk_buff **list_skb;
1da177e4
LT
6312 struct sk_buff *skb;
6313 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6314 struct softnet_data *sd, *oldsd;
6315
8bb78442 6316 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6317 return NOTIFY_OK;
6318
6319 local_irq_disable();
6320 cpu = smp_processor_id();
6321 sd = &per_cpu(softnet_data, cpu);
6322 oldsd = &per_cpu(softnet_data, oldcpu);
6323
6324 /* Find end of our completion_queue. */
6325 list_skb = &sd->completion_queue;
6326 while (*list_skb)
6327 list_skb = &(*list_skb)->next;
6328 /* Append completion queue from offline CPU. */
6329 *list_skb = oldsd->completion_queue;
6330 oldsd->completion_queue = NULL;
6331
1da177e4 6332 /* Append output queue from offline CPU. */
a9cbd588
CG
6333 if (oldsd->output_queue) {
6334 *sd->output_queue_tailp = oldsd->output_queue;
6335 sd->output_queue_tailp = oldsd->output_queue_tailp;
6336 oldsd->output_queue = NULL;
6337 oldsd->output_queue_tailp = &oldsd->output_queue;
6338 }
264524d5
HC
6339 /* Append NAPI poll list from offline CPU. */
6340 if (!list_empty(&oldsd->poll_list)) {
6341 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6342 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6343 }
1da177e4
LT
6344
6345 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6346 local_irq_enable();
6347
6348 /* Process offline CPU's input_pkt_queue */
76cc8b13 6349 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 6350 netif_rx(skb);
76cc8b13 6351 input_queue_head_incr(oldsd);
fec5e652 6352 }
76cc8b13 6353 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 6354 netif_rx(skb);
76cc8b13
TH
6355 input_queue_head_incr(oldsd);
6356 }
1da177e4
LT
6357
6358 return NOTIFY_OK;
6359}
1da177e4
LT
6360
6361
7f353bf2 6362/**
b63365a2
HX
6363 * netdev_increment_features - increment feature set by one
6364 * @all: current feature set
6365 * @one: new feature set
6366 * @mask: mask feature set
7f353bf2
HX
6367 *
6368 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6369 * @one to the master device with current feature set @all. Will not
6370 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6371 */
04ed3e74 6372u32 netdev_increment_features(u32 all, u32 one, u32 mask)
b63365a2 6373{
1742f183
MM
6374 if (mask & NETIF_F_GEN_CSUM)
6375 mask |= NETIF_F_ALL_CSUM;
6376 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6377
1742f183
MM
6378 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6379 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6380
1742f183
MM
6381 /* If device needs checksumming, downgrade to it. */
6382 if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6383 all &= ~NETIF_F_NO_CSUM;
7f353bf2 6384
1742f183
MM
6385 /* If one device supports hw checksumming, set for all. */
6386 if (all & NETIF_F_GEN_CSUM)
6387 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6388
6389 return all;
6390}
b63365a2 6391EXPORT_SYMBOL(netdev_increment_features);
7f353bf2 6392
30d97d35
PE
6393static struct hlist_head *netdev_create_hash(void)
6394{
6395 int i;
6396 struct hlist_head *hash;
6397
6398 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6399 if (hash != NULL)
6400 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6401 INIT_HLIST_HEAD(&hash[i]);
6402
6403 return hash;
6404}
6405
881d966b 6406/* Initialize per network namespace state */
4665079c 6407static int __net_init netdev_init(struct net *net)
881d966b 6408{
881d966b 6409 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6410
30d97d35
PE
6411 net->dev_name_head = netdev_create_hash();
6412 if (net->dev_name_head == NULL)
6413 goto err_name;
881d966b 6414
30d97d35
PE
6415 net->dev_index_head = netdev_create_hash();
6416 if (net->dev_index_head == NULL)
6417 goto err_idx;
881d966b
EB
6418
6419 return 0;
30d97d35
PE
6420
6421err_idx:
6422 kfree(net->dev_name_head);
6423err_name:
6424 return -ENOMEM;
881d966b
EB
6425}
6426
f0db275a
SH
6427/**
6428 * netdev_drivername - network driver for the device
6429 * @dev: network device
f0db275a
SH
6430 *
6431 * Determine network driver for device.
6432 */
3019de12 6433const char *netdev_drivername(const struct net_device *dev)
6579e57b 6434{
cf04a4c7
SH
6435 const struct device_driver *driver;
6436 const struct device *parent;
3019de12 6437 const char *empty = "";
6579e57b
AV
6438
6439 parent = dev->dev.parent;
6579e57b 6440 if (!parent)
3019de12 6441 return empty;
6579e57b
AV
6442
6443 driver = parent->driver;
6444 if (driver && driver->name)
3019de12
DM
6445 return driver->name;
6446 return empty;
6579e57b
AV
6447}
6448
ffa10cb4 6449int __netdev_printk(const char *level, const struct net_device *dev,
256df2f3
JP
6450 struct va_format *vaf)
6451{
6452 int r;
6453
6454 if (dev && dev->dev.parent)
6455 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6456 netdev_name(dev), vaf);
6457 else if (dev)
6458 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6459 else
6460 r = printk("%s(NULL net_device): %pV", level, vaf);
6461
6462 return r;
6463}
ffa10cb4 6464EXPORT_SYMBOL(__netdev_printk);
256df2f3
JP
6465
6466int netdev_printk(const char *level, const struct net_device *dev,
6467 const char *format, ...)
6468{
6469 struct va_format vaf;
6470 va_list args;
6471 int r;
6472
6473 va_start(args, format);
6474
6475 vaf.fmt = format;
6476 vaf.va = &args;
6477
6478 r = __netdev_printk(level, dev, &vaf);
6479 va_end(args);
6480
6481 return r;
6482}
6483EXPORT_SYMBOL(netdev_printk);
6484
6485#define define_netdev_printk_level(func, level) \
6486int func(const struct net_device *dev, const char *fmt, ...) \
6487{ \
6488 int r; \
6489 struct va_format vaf; \
6490 va_list args; \
6491 \
6492 va_start(args, fmt); \
6493 \
6494 vaf.fmt = fmt; \
6495 vaf.va = &args; \
6496 \
6497 r = __netdev_printk(level, dev, &vaf); \
6498 va_end(args); \
6499 \
6500 return r; \
6501} \
6502EXPORT_SYMBOL(func);
6503
6504define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6505define_netdev_printk_level(netdev_alert, KERN_ALERT);
6506define_netdev_printk_level(netdev_crit, KERN_CRIT);
6507define_netdev_printk_level(netdev_err, KERN_ERR);
6508define_netdev_printk_level(netdev_warn, KERN_WARNING);
6509define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6510define_netdev_printk_level(netdev_info, KERN_INFO);
6511
4665079c 6512static void __net_exit netdev_exit(struct net *net)
881d966b
EB
6513{
6514 kfree(net->dev_name_head);
6515 kfree(net->dev_index_head);
6516}
6517
022cbae6 6518static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
6519 .init = netdev_init,
6520 .exit = netdev_exit,
6521};
6522
4665079c 6523static void __net_exit default_device_exit(struct net *net)
ce286d32 6524{
e008b5fc 6525 struct net_device *dev, *aux;
ce286d32 6526 /*
e008b5fc 6527 * Push all migratable network devices back to the
ce286d32
EB
6528 * initial network namespace
6529 */
6530 rtnl_lock();
e008b5fc 6531 for_each_netdev_safe(net, dev, aux) {
ce286d32 6532 int err;
aca51397 6533 char fb_name[IFNAMSIZ];
ce286d32
EB
6534
6535 /* Ignore unmoveable devices (i.e. loopback) */
6536 if (dev->features & NETIF_F_NETNS_LOCAL)
6537 continue;
6538
e008b5fc
EB
6539 /* Leave virtual devices for the generic cleanup */
6540 if (dev->rtnl_link_ops)
6541 continue;
d0c082ce 6542
25985edc 6543 /* Push remaining network devices to init_net */
aca51397
PE
6544 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6545 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6546 if (err) {
aca51397 6547 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
ce286d32 6548 __func__, dev->name, err);
aca51397 6549 BUG();
ce286d32
EB
6550 }
6551 }
6552 rtnl_unlock();
6553}
6554
04dc7f6b
EB
6555static void __net_exit default_device_exit_batch(struct list_head *net_list)
6556{
6557 /* At exit all network devices most be removed from a network
b595076a 6558 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
6559 * Do this across as many network namespaces as possible to
6560 * improve batching efficiency.
6561 */
6562 struct net_device *dev;
6563 struct net *net;
6564 LIST_HEAD(dev_kill_list);
6565
6566 rtnl_lock();
6567 list_for_each_entry(net, net_list, exit_list) {
6568 for_each_netdev_reverse(net, dev) {
6569 if (dev->rtnl_link_ops)
6570 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6571 else
6572 unregister_netdevice_queue(dev, &dev_kill_list);
6573 }
6574 }
6575 unregister_netdevice_many(&dev_kill_list);
ceaaec98 6576 list_del(&dev_kill_list);
04dc7f6b
EB
6577 rtnl_unlock();
6578}
6579
022cbae6 6580static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6581 .exit = default_device_exit,
04dc7f6b 6582 .exit_batch = default_device_exit_batch,
ce286d32
EB
6583};
6584
1da177e4
LT
6585/*
6586 * Initialize the DEV module. At boot time this walks the device list and
6587 * unhooks any devices that fail to initialise (normally hardware not
6588 * present) and leaves us with a valid list of present and active devices.
6589 *
6590 */
6591
6592/*
6593 * This is called single threaded during boot, so no need
6594 * to take the rtnl semaphore.
6595 */
6596static int __init net_dev_init(void)
6597{
6598 int i, rc = -ENOMEM;
6599
6600 BUG_ON(!dev_boot_phase);
6601
1da177e4
LT
6602 if (dev_proc_init())
6603 goto out;
6604
8b41d188 6605 if (netdev_kobject_init())
1da177e4
LT
6606 goto out;
6607
6608 INIT_LIST_HEAD(&ptype_all);
82d8a867 6609 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6610 INIT_LIST_HEAD(&ptype_base[i]);
6611
881d966b
EB
6612 if (register_pernet_subsys(&netdev_net_ops))
6613 goto out;
1da177e4
LT
6614
6615 /*
6616 * Initialise the packet receive queues.
6617 */
6618
6f912042 6619 for_each_possible_cpu(i) {
e36fa2f7 6620 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6621
dee42870 6622 memset(sd, 0, sizeof(*sd));
e36fa2f7 6623 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6624 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6625 sd->completion_queue = NULL;
6626 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6627 sd->output_queue = NULL;
6628 sd->output_queue_tailp = &sd->output_queue;
df334545 6629#ifdef CONFIG_RPS
e36fa2f7
ED
6630 sd->csd.func = rps_trigger_softirq;
6631 sd->csd.info = sd;
6632 sd->csd.flags = 0;
6633 sd->cpu = i;
1e94d72f 6634#endif
0a9627f2 6635
e36fa2f7
ED
6636 sd->backlog.poll = process_backlog;
6637 sd->backlog.weight = weight_p;
6638 sd->backlog.gro_list = NULL;
6639 sd->backlog.gro_count = 0;
1da177e4
LT
6640 }
6641
1da177e4
LT
6642 dev_boot_phase = 0;
6643
505d4f73
EB
6644 /* The loopback device is special if any other network devices
6645 * is present in a network namespace the loopback device must
6646 * be present. Since we now dynamically allocate and free the
6647 * loopback device ensure this invariant is maintained by
6648 * keeping the loopback device as the first device on the
6649 * list of network devices. Ensuring the loopback devices
6650 * is the first device that appears and the last network device
6651 * that disappears.
6652 */
6653 if (register_pernet_device(&loopback_net_ops))
6654 goto out;
6655
6656 if (register_pernet_device(&default_device_ops))
6657 goto out;
6658
962cf36c
CM
6659 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6660 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6661
6662 hotcpu_notifier(dev_cpu_callback, 0);
6663 dst_init();
6664 dev_mcast_init();
6665 rc = 0;
6666out:
6667 return rc;
6668}
6669
6670subsys_initcall(net_dev_init);
6671
e88721f8
KK
6672static int __init initialize_hashrnd(void)
6673{
0a9627f2 6674 get_random_bytes(&hashrnd, sizeof(hashrnd));
e88721f8
KK
6675 return 0;
6676}
6677
6678late_initcall_sync(initialize_hashrnd);
6679
This page took 1.876985 seconds and 5 git commands to generate.