/*
 *      NET3    Protocol independent device support routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *      Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *      Additional Authors:
 *              Florian la Roche <rzsfl@rz.uni-sb.de>
 *              Alan Cox <gw4pts@gw4pts.ampr.org>
 *              David Hinds <dahinds@users.sourceforge.net>
 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *              Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *      Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *              Alan Cox        :       device private ioctl copies fields back.
 *              Alan Cox        :       Transmit queue code does relevant
 *                                      stunts to keep the queue safe.
 *              Alan Cox        :       Fixed double lock.
 *              Alan Cox        :       Fixed promisc NULL pointer trap
 *              ????????        :       Support the full private ioctl range
 *              Alan Cox        :       Moved ioctl permission check into
 *                                      drivers
 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 *              Alan Cox        :       100 backlog just doesn't cut it when
 *                                      you start doing multicast video 8)
 *              Alan Cox        :       Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *              Alan Cox        :       Took out transmit every packet pass
 *                                      Saved a few bytes in the ioctl handler
 *              Alan Cox        :       Network driver sets packet type before
 *                                      calling netif_rx. Saves a function
 *                                      call a packet.
 *              Alan Cox        :       Hashed net_bh()
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 *              Alan Cox        :       Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                      changes.
 *              Rudi Cilibrasi  :       Pass the right thing to
 *                                      set_mac_address()
 *              Dave Miller     :       32bit quantity for the device lock to
 *                                      make it work out on a Sparc.
 *              Bjorn Ekwall    :       Added KERNELD hack.
 *              Alan Cox        :       Cleaned up the backlog initialise.
 *              Craig Metz      :       SIOCGIFCONF fix if space for under
 *                                      1 device.
 *              Thomas Bogendoerfer :   Return ENODEV for dev_open, if there
 *                                      is no device open function.
 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 *              Michael Chastain:       Fix signed/unsigned for SIOCGIFCONF
 *              Cyrus Durgin    :       Cleaned for KMOD
 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 *                                      A network device unload needs to purge
 *                                      the backlog queue.
 *              Paul Rusty Russell :    SIOCSIFNAME
 *              Pekka Riikonen  :       Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                      - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *      The list of packet types we will receive (as opposed to discard)
 *      and the routines to invoke.
 *
 *      Why 16. Because with 16 the only overlap we get on a hash of the
 *      low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *             sure which should go first, but I bet it won't make much
 *             difference if we are running VLANs.  The good news is that
 *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 *             --BLG
 *
 *              0800    IP
 *              8100    802.1Q VLAN
 *              0001    802.3
 *              0002    AX.25
 *              0004    802.2
 *              8035    RARP
 *              0005    SNAP
 *              0805    X.25
 *              0806    ARP
 *              8137    IPX
 *              0009    Localtalk
 *              86DD    IPv6
 */

#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;        /* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

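/*
 * Illustrative sketch, not part of the original file: a pure reader
 * following the locking rules documented above. The function name is
 * hypothetical; in-tree readers use this pattern, or rcu_read_lock()
 * with for_each_netdev_rcu() instead.
 */
#if 0
static int example_count_netdevs(struct net *net)
{
        struct net_device *dev;
        int count = 0;

        /* pure reader: dev_base_lock for reading is sufficient */
        read_lock(&dev_base_lock);
        for_each_netdev(net, dev)
                count++;
        read_unlock(&dev_base_lock);

        return count;
}
#endif
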
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        write_lock_bh(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
        write_unlock_bh(&dev_base_lock);
        return 0;
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);
}

/*
 *      Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
                if (netdev_lock_type[i] == dev_type)
                        return i;
        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int i;

        i = netdev_lock_pos(dev_type);
        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
                                   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int i;

        i = netdev_lock_pos(dev->type);
        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[i],
                                   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

                Protocol management and registration routines

*******************************************************************************/

/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers, mangling input packets,
 *      MUST BE last in hash buckets and checking protocol handlers
 *      MUST start from promiscuous ptype_all chain in net_bh.
 *      It is true now, do not change it.
 *      Explanation follows: if a protocol handler that mangles packets
 *      were first on the list, it could not detect that the packet is
 *      cloned and should be copied-on-write; it would change the packet
 *      in place and subsequent readers would see a broken packet.
 *      --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL))
                return &ptype_all;
        else
                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *      dev_add_pack - add packet handler
 *      @pt: packet type declaration
 *
 *      Add a protocol handler to the networking stack. The passed &packet_type
 *      is linked into kernel lists and may not be freed until it has been
 *      removed from the kernel lists.
 *
 *      This call does not sleep, therefore it can not guarantee that all
 *      CPUs that are in the middle of receiving packets will see the new
 *      packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
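
/*
 * Illustrative sketch, not part of the original file: how a protocol
 * module typically pairs dev_add_pack() with dev_remove_pack(). All
 * names are hypothetical; ETH_P_802_EX1 is just the local experimental
 * ethertype used here as a stand-in.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        /* a real handler would parse skb here; we just drop it */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_802_EX1),
        .func = example_rcv,
};

static int __init example_init(void)
{
        dev_add_pack(&example_packet_type);
        return 0;
}

static void __exit example_exit(void)
{
        /* dev_remove_pack() sleeps until no CPU can still see the handler */
        dev_remove_pack(&example_packet_type);
}
#endif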

/**
 *      __dev_remove_pack - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all the CPUs have gone
 *      through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *      dev_remove_pack - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

                      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *      netdev_boot_setup_add - add new setup entry
 *      @name: name of the device
 *      @map: configured settings for the device
 *
 *      Adds new setup entry to the dev_boot_setup list.  The function
 *      returns 0 on error and 1 on success.  This is a generic routine for
 *      all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
        struct netdev_boot_setup *s;
        int i;

        s = dev_boot_setup;
        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
                        memset(s[i].name, 0, sizeof(s[i].name));
                        strlcpy(s[i].name, name, IFNAMSIZ);
                        memcpy(&s[i].map, map, sizeof(s[i].map));
                        break;
                }
        }

        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *      netdev_boot_setup_check - check boot time settings
 *      @dev: the netdevice
 *
 *      Check boot time settings for the device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
        struct netdev_boot_setup *s = dev_boot_setup;
        int i;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
                    !strcmp(dev->name, s[i].name)) {
                        dev->irq        = s[i].map.irq;
                        dev->base_addr  = s[i].map.base_addr;
                        dev->mem_start  = s[i].map.mem_start;
                        dev->mem_end    = s[i].map.mem_end;
                        return 1;
                }
        }
        return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *      netdev_boot_base        - get address from boot time settings
 *      @prefix: prefix for network device
 *      @unit: id for network device
 *
 *      Check boot time settings for the base address of device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
        const struct netdev_boot_setup *s = dev_boot_setup;
        char name[IFNAMSIZ];
        int i;

        sprintf(name, "%s%d", prefix, unit);

        /*
         * If device already registered then return base of 1
         * to indicate not to probe for this interface
         */
        if (__dev_get_by_name(&init_net, name))
                return 1;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
                if (!strcmp(name, s[i].name))
                        return s[i].map.base_addr;
        return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
        int ints[5];
        struct ifmap map;

        str = get_options(str, ARRAY_SIZE(ints), ints);
        if (!str || !*str)
                return 0;

        /* Save settings */
        memset(&map, 0, sizeof(map));
        if (ints[0] > 0)
                map.irq = ints[1];
        if (ints[0] > 1)
                map.base_addr = ints[2];
        if (ints[0] > 2)
                map.mem_start = ints[3];
        if (ints[0] > 3)
                map.mem_end = ints[4];

        /* Add new entry to the list */
        return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
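
/*
 * Illustrative note, not part of the original file: with the __setup
 * hook above, a kernel command line such as
 *
 *      netdev=9,0x300,0,0,eth0
 *
 * is parsed by get_options() into irq=9, base_addr=0x300, mem_start=0
 * and mem_end=0, and the remaining string "eth0" becomes the name
 * passed to netdev_boot_setup_add(). The concrete values here are a
 * made-up example.
 */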

/*******************************************************************************

                          Device Interface Subroutines

*******************************************************************************/

/**
 *      __dev_get_by_name       - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. Must be called under RTNL semaphore
 *      or @dev_base_lock. If the name is found a pointer to the device
 *      is returned. If the name is not found then %NULL is returned. The
 *      reference counters are not incremented so the caller must be
 *      careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry(dev, p, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *      dev_get_by_name_rcu     - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name.
 *      If the name is found a pointer to the device is returned.
 *      If the name is not found then %NULL is returned.
 *      The reference counters are not incremented so the caller must be
 *      careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *      dev_get_by_name         - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. This can be called from any
 *      context and does its own locking. The returned handle has
 *      the usage count incremented and the caller must use dev_put() to
 *      release it when it is no longer needed. %NULL is returned if no
 *      matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
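
/*
 * Illustrative sketch, not part of the original file: the usual
 * hold/use/put pattern around dev_get_by_name(). The function name is
 * hypothetical.
 */
#if 0
static int example_get_ifindex(struct net *net, const char *name)
{
        struct net_device *dev;
        int ifindex;

        dev = dev_get_by_name(net, name);       /* takes a reference */
        if (!dev)
                return -ENODEV;
        ifindex = dev->ifindex;
        dev_put(dev);                           /* drop that reference */

        return ifindex;
}
#endif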

/**
 *      __dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold either the RTNL semaphore
 *      or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, p, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *      dev_get_by_index_rcu - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *      dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns NULL if the device
 *      is not found or a pointer to the device. The device returned has
 *      had a reference added and the pointer is safe until the user calls
 *      dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *      dev_getbyhwaddr - find a device by its hardware address
 *      @net: the applicable net namespace
 *      @type: media type of device
 *      @ha: hardware address
 *
 *      Search for an interface by MAC address. Returns NULL if the device
 *      is not found or a pointer to the device. The caller must hold the
 *      rtnl semaphore. The returned device has not had its ref count increased
 *      and the caller must therefore be careful about locking
 *
 *      BUGS:
 *      If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
        struct net_device *dev;

        ASSERT_RTNL();

        for_each_netdev(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev->type == type)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *      dev_get_by_flags_rcu - find any device with given flags
 *      @net: the applicable net namespace
 *      @if_flags: IFF_* values
 *      @mask: bitmask of bits in if_flags to check
 *
 *      Search for any interface with the given flags. Returns NULL if a device
 *      is not found or a pointer to the device. Must be called inside
 *      rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
                                        unsigned short mask)
{
        struct net_device *dev, *ret;

        ret = NULL;
        for_each_netdev_rcu(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
                        ret = dev;
                        break;
                }
        }
        return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *      dev_valid_name - check if name is okay for network device
 *      @name: name string
 *
 *      Network device names need to be valid file names to
 *      allow sysfs to work.  We also disallow any kind of
 *      whitespace.
 */
int dev_valid_name(const char *name)
{
        if (*name == '\0')
                return 0;
        if (strlen(name) >= IFNAMSIZ)
                return 0;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return 0;

        while (*name) {
                if (*name == '/' || isspace(*name))
                        return 0;
                name++;
        }
        return 1;
}
EXPORT_SYMBOL(dev_valid_name);
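
/*
 * Illustrative note, not part of the original file: per the checks
 * above, names such as "eth0" or "my-tap" are accepted, while "",
 * ".", "..", anything containing '/' or whitespace, and any name of
 * IFNAMSIZ or more characters are all rejected.
 */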

/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be either one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /* avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}

/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        char buf[IFNAMSIZ];
        struct net *net;
        int ret;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);
        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
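
/*
 * Illustrative sketch, not part of the original file: a driver asking
 * for the first free "foo%d" slot. If "foo0" and "foo1" already exist,
 * dev->name becomes "foo2" and 2 is returned. The "foo" prefix is
 * hypothetical.
 */
#if 0
        err = dev_alloc_name(dev, "foo%d");
        if (err < 0)
                goto fail;
#endif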

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
        struct net *net;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);

        if (!dev_valid_name(name))
                return -EINVAL;

        if (fmt && strchr(name, '%'))
                return dev_alloc_name(dev, name);
        else if (__dev_get_by_name(net, name))
                return -EEXIST;
        else if (dev->name != name)
                strlcpy(dev->name, name, IFNAMSIZ);

        return 0;
}

/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change name of a device, can pass format strings "eth%d"
 *      for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
                return 0;

        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(dev, newname, 1);
        if (err < 0)
                return err;

rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                return ret;
        }

        write_lock_bh(&dev_base_lock);
        hlist_del(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        err = ret;
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        goto rollback;
                } else {
                        printk(KERN_ERR
                               "%s: name change rollback failed: %d.\n",
                               dev->name, ret);
                }
        }

        return err;
}

/**
 *      dev_set_alias - change ifalias of a device
 *      @dev: device
 *      @alias: name up to IFALIASZ
 *      @len: limit of bytes to copy from info
 *
 *      Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        ASSERT_RTNL();

        if (len >= IFALIASZ)
                return -EINVAL;

        if (!len) {
                if (dev->ifalias) {
                        kfree(dev->ifalias);
                        dev->ifalias = NULL;
                }
                return 0;
        }

        dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
        if (!dev->ifalias)
                return -ENOMEM;

        strlcpy(dev->ifalias, alias, len+1);
        return len;
}


/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *      netdev_state_change - device changes state
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed state. This function calls
 *      the notifier chains for netdev_chain and sends a NEWLINK message
 *      to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                call_netdevice_notifiers(NETDEV_CHANGE, dev);
                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
        }
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *      dev_load        - load a network module
 *      @net: the applicable net namespace
 *      @name: name of interface
 *
 *      If a network interface is not present and the process has suitable
 *      privileges this function loads the module. If module loading is not
 *      available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        rcu_read_unlock();

        if (!dev && capable(CAP_NET_ADMIN))
                request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();

        /*
         *      Is it even present?
         */
        if (!netif_device_present(dev))
                return -ENODEV;

        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        /*
         *      Call device private open method
         */
        set_bit(__LINK_STATE_START, &dev->state);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        /*
         *      If it went open OK then:
         */

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                /*
                 *      Set the flags.
                 */
                dev->flags |= IFF_UP;

                /*
                 *      Enable NET_DMA
                 */
                net_dmaengine_get();

                /*
                 *      Initialize multicasting status
                 */
                dev_set_rx_mode(dev);

                /*
                 *      Wakeup transmit queue engine
                 */
                dev_activate(dev);
        }

        return ret;
}

/**
 *      dev_open        - prepare an interface for use.
 *      @dev:   device to open
 *
 *      Takes a device from down to up state. The device's private open
 *      function is invoked and then the multicast lists are loaded. Finally
 *      the device is moved into the up state and a %NETDEV_UP message is
 *      sent to the netdev notifier chain.
 *
 *      Calling this function on an active interface is a nop. On a failure
 *      a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
        int ret;

        /*
         *      Is it already up?
         */
        if (dev->flags & IFF_UP)
                return 0;

        /*
         *      Open device
         */
        ret = __dev_open(dev);
        if (ret < 0)
                return ret;

        /*
         *      ... and announce new interface.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
        call_netdevice_notifiers(NETDEV_UP, dev);

        return ret;
}
EXPORT_SYMBOL(dev_open);
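
/*
 * Illustrative sketch, not part of the original file: bringing an
 * interface up from other kernel code. dev_open()/dev_close() assume
 * rtnl_lock is held, as the ASSERT_RTNL() calls above enforce.
 */
#if 0
        rtnl_lock();
        err = dev_open(dev);
        rtnl_unlock();
#endif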

static int __dev_close(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        ASSERT_RTNL();
        might_sleep();

        /*
         *      Tell people we are going down, so that they can
         *      prepare for it while the device is still operating.
         */
        call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

        clear_bit(__LINK_STATE_START, &dev->state);

        /* Synchronize to scheduled poll. We cannot touch poll list,
         * it can be even on different cpu. So just clear netif_running().
         *
         * dev->stop() will invoke napi_disable() on all of its
         * napi_struct instances on this device.
         */
        smp_mb__after_clear_bit(); /* Commit netif_running(). */

        dev_deactivate(dev);

        /*
         *      Call the device specific close. This cannot fail.
         *      Only if device is UP
         *
         *      We allow it to be called even after a DETACH hot-plug
         *      event.
         */
        if (ops->ndo_stop)
                ops->ndo_stop(dev);

        /*
         *      Device is now down.
         */

        dev->flags &= ~IFF_UP;

        /*
         *      Shutdown NET_DMA
         */
        net_dmaengine_put();

        return 0;
}

/**
 *      dev_close - shutdown an interface.
 *      @dev: device to shutdown
 *
 *      This function moves an active device into down state. A
 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *      chain.
 */
int dev_close(struct net_device *dev)
{
        if (!(dev->flags & IFF_UP))
                return 0;

        __dev_close(dev);

        /*
         *      Tell people we are down
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
        call_netdevice_notifiers(NETDEV_DOWN, dev);

        return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *      dev_disable_lro - disable Large Receive Offload on a device
 *      @dev: device
 *
 *      Disable Large Receive Offload (LRO) on a net device.  Must be
 *      called under RTNL.  This is needed if received packets may be
 *      forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
        if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
            dev->ethtool_ops->set_flags) {
                u32 flags = dev->ethtool_ops->get_flags(dev);
                if (flags & ETH_FLAG_LRO) {
                        flags &= ~ETH_FLAG_LRO;
                        dev->ethtool_ops->set_flags(dev, flags);
                }
        }
        WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/*
 *      Device change register/unregister. These are not inline or static
 *      as we export them to the world.
 */

/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered, all registration and up events are replayed
 *      to the new notifier to allow it to have a race free
 *      view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        nb->notifier_call(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                break;

                        if (dev->flags & IFF_UP) {
                                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
                                nb->notifier_call(nb, NETDEV_DOWN, dev);
                        }
                        nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
                        nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
                }
        }

        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
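
/*
 * Illustrative sketch, not part of the original file: a minimal
 * netdevice notifier. All names are hypothetical. Because of the
 * replay behaviour documented above, the callback also sees
 * NETDEV_REGISTER/NETDEV_UP for devices that already exist when it
 * is registered. In this kernel the ptr argument is the net_device
 * itself.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;

        if (event == NETDEV_UP)
                printk(KERN_DEBUG "%s is up\n", dev->name);
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
        .notifier_call = example_netdev_event,
};

/* somewhere in module init: */
static int __init example_init(void)
{
        return register_netdevice_notifier(&example_netdev_notifier);
}
#endif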

/**
 *      unregister_netdevice_notifier - unregister a network notifier block
 *      @nb: notifier
 *
 *      Unregister a notifier previously registered by
 *      register_netdevice_notifier(). The notifier is unlinked from the
 *      kernel structures and may then be reused. A negative errno code
 *      is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        int err;

        rtnl_lock();
        err = raw_notifier_chain_unregister(&netdev_chain, nb);
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        ASSERT_RTNL();
        return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
        atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
        atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
        if (atomic_read(&netstamp_needed))
                __net_timestamp(skb);
        else
                skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
        if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
                __net_timestamp(skb);
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        skb_orphan(skb);
        nf_reset(skb);

        if (unlikely(!(dev->flags & IFF_UP) ||
                     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
                atomic_long_inc(&dev->rx_dropped);
                kfree_skb(skb);
                return NET_RX_DROP;
        }
        skb_set_dev(skb, dev);
        skb->tstamp.tv64 = 0;
        skb->pkt_type = PACKET_HOST;
        skb->protocol = eth_type_trans(skb, dev);
        return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
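
/*
 * Illustrative sketch, not part of the original file: the core of a
 * veth-like device pair, where one device's start_xmit hands the skb
 * to its peer's receive path. example_xmit() and example_get_peer()
 * are hypothetical names.
 */
#if 0
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *peer = example_get_peer(dev);   /* hypothetical */

        /* dev_forward_skb() scrubs namespace state and calls netif_rx();
         * on NET_RX_DROP the skb has already been freed for us.
         */
        if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
                dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}
#endif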

/*
 *      Support routine. Sends outgoing frames to any network
 *      taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
        if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
                net_timestamp_set(skb);
#else
        net_timestamp_set(skb);
#endif

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if ((ptype->dev == dev || !ptype->dev) &&
                    (ptype->af_packet_priv == NULL ||
                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (!skb2)
                                break;

                        /* skb->nh should be correctly
                           set by sender, so that the second statement is
                           just protection against buggy protocols.
                         */
                        skb_reset_mac_header(skb2);

                        if (skb_network_header(skb2) < skb2->data ||
                            skb2->network_header > skb2->tail) {
                                if (net_ratelimit())
                                        printk(KERN_CRIT "protocol %04x is "
                                               "buggy, dev %s\n",
                                               ntohs(skb2->protocol),
                                               dev->name);
                                skb_reset_network_header(skb2);
                        }

                        skb2->transport_header = skb2->network_header;
                        skb2->pkt_type = PACKET_OUTGOING;
                        ptype->func(skb2, skb->dev, ptype, skb->dev);
                }
        }
        rcu_read_unlock();
}

/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
        if (txq < 1 || txq > dev->num_tx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                ASSERT_RTNL();

                if (txq < dev->real_num_tx_queues)
                        qdisc_reset_all_tx_gt(dev, txq);
        }

        dev->real_num_tx_queues = txq;
        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
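
/*
 * Illustrative sketch, not part of the original file: a multiqueue
 * driver shrinking its active TX queue set, e.g. after an ethtool
 * channel-count change. Once the device is registered this must run
 * under rtnl_lock, as the ASSERT_RTNL() above requires; new_txq is a
 * hypothetical variable.
 */
#if 0
        err = netif_set_real_num_tx_queues(dev, new_txq);
        if (err)
                return err;
#endif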

#ifdef CONFIG_RPS
/**
 *      netif_set_real_num_rx_queues - set actual number of RX queues used
 *      @dev: Network device
 *      @rxq: Actual number of RX queues
 *
 *      This must be called either with the rtnl_lock held or before
 *      registration of the net device.  Returns 0 on success, or a
 *      negative error code.  If called before registration, it always
 *      succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        int rc;

        if (rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                ASSERT_RTNL();

                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
                                                  rxq);
                if (rc)
                        return rc;
        }

        dev->real_num_rx_queues = rxq;
        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

static inline void __netif_reschedule(struct Qdisc *q)
{
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = &__get_cpu_var(softnet_data);
        q->next_sched = NULL;
        *sd->output_queue_tailp = q;
        sd->output_queue_tailp = &q->next_sched;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
                __netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
        if (atomic_dec_and_test(&skb->users)) {
                struct softnet_data *sd;
                unsigned long flags;

                local_irq_save(flags);
                sd = &__get_cpu_var(softnet_data);
                skb->next = sd->completion_queue;
                sd->completion_queue = skb;
                raise_softirq_irqoff(NET_TX_SOFTIRQ);
                local_irq_restore(flags);
        }
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (in_irq() || irqs_disabled())
                dev_kfree_skb_irq(skb);
        else
                dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);


/**
 *      netif_device_detach - mark device as removed
 *      @dev: network device
 *
 *      Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
            netif_running(dev)) {
                netif_tx_stop_all_queues(dev);
        }
}
EXPORT_SYMBOL(netif_device_detach);

/**
 *      netif_device_attach - mark device as attached
 *      @dev: network device
 *
 *      Mark device as attached to the system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
            netif_running(dev)) {
                netif_tx_wake_all_queues(dev);
                __netdev_watchdog_up(dev);
        }
}
EXPORT_SYMBOL(netif_device_attach);
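
/*
 * Illustrative sketch, not part of the original file: the usual
 * pairing of these helpers in a PCI driver's suspend/resume path.
 * The function names are hypothetical.
 */
#if 0
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        netif_device_detach(dev);       /* stops all TX queues if running */
        /* ... save state, power down hardware ... */
        return 0;
}

static int example_resume(struct pci_dev *pdev)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        /* ... power up and reinitialize hardware ... */
        netif_device_attach(dev);       /* restarts queues and watchdog */
        return 0;
}
#endif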

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
        return ((features & NETIF_F_GEN_CSUM) ||
                ((features & NETIF_F_IP_CSUM) &&
                 protocol == htons(ETH_P_IP)) ||
                ((features & NETIF_F_IPV6_CSUM) &&
                 protocol == htons(ETH_P_IPV6)) ||
                ((features & NETIF_F_FCOE_CRC) &&
                 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
        int features = dev->features;

        if (vlan_tx_tag_present(skb))
                features &= dev->vlan_features;

        if (can_checksum_protocol(features, skb->protocol))
                return true;

        if (skb->protocol == htons(ETH_P_8021Q)) {
                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
                if (can_checksum_protocol(dev->features & dev->vlan_features,
                                          veh->h_vlan_encapsulated_proto))
                        return true;
        }

        return false;
}

/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        skb_dst_drop(skb);
        if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }
        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
        __wsum csum;
        int ret = 0, offset;

        if (skb->ip_summed == CHECKSUM_COMPLETE)
                goto out_set_summed;

        if (unlikely(skb_shinfo(skb)->gso_size)) {
                /* Let GSO fix up the checksum. */
                goto out_set_summed;
        }

        offset = skb->csum_start - skb_headroom(skb);
        BUG_ON(offset >= skb_headlen(skb));
        csum = skb_checksum(skb, offset, skb->len - offset, 0);

        offset += skb->csum_offset;
        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

        if (skb_cloned(skb) &&
            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
                if (ret)
                        goto out;
        }

        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
        skb->ip_summed = CHECKSUM_NONE;
out:
        return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
1da177e4 1785
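/*
 * Worked example (illustrative, mirrors what csum_fold() does above):
 * the 32-bit one's-complement accumulator from skb_checksum() is folded
 * to 16 bits by adding the halves (twice, to absorb any carry) and then
 * complementing, e.g. 0x1a2b3c4d -> 0x1a2b + 0x3c4d = 0x5678 -> 0xa987.
 */
static u16 example_csum_fold(u32 sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold high half into low */
	sum = (sum & 0xffff) + (sum >> 16);	/* absorb the possible carry */
	return (u16)~sum;
}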
f6a78bfc
HX
1786/**
1787 * skb_gso_segment - Perform segmentation on skb.
1788 * @skb: buffer to segment
576a30eb 1789 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1790 *
1791 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1792 *
1793 * It may return NULL if the skb requires no segmentation. This is
1794 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1795 */
576a30eb 1796struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
f6a78bfc
HX
1797{
1798 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1799 struct packet_type *ptype;
252e3346 1800 __be16 type = skb->protocol;
a430a43d 1801 int err;
f6a78bfc 1802
7b9c6090
JG
1803 if (type == htons(ETH_P_8021Q)) {
1804 struct vlan_ethhdr *veh;
1805
1806 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
1807 return ERR_PTR(-EINVAL);
1808
1809 veh = (struct vlan_ethhdr *)skb->data;
1810 type = veh->h_vlan_encapsulated_proto;
1811 }
1812
459a98ed 1813 skb_reset_mac_header(skb);
b0e380b1 1814 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1815 __skb_pull(skb, skb->mac_len);
1816
67fd1a73
HX
1817 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1818 struct net_device *dev = skb->dev;
1819 struct ethtool_drvinfo info = {};
1820
1821 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1822 dev->ethtool_ops->get_drvinfo(dev, &info);
1823
1824 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1825 "ip_summed=%d",
1826 info.driver, dev ? dev->features : 0L,
1827 skb->sk ? skb->sk->sk_route_caps : 0L,
1828 skb->len, skb->data_len, skb->ip_summed);
1829
a430a43d
HX
1830 if (skb_header_cloned(skb) &&
1831 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1832 return ERR_PTR(err);
1833 }
1834
f6a78bfc 1835 rcu_read_lock();
82d8a867
PE
1836 list_for_each_entry_rcu(ptype,
1837 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1838 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1839 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1840 err = ptype->gso_send_check(skb);
1841 segs = ERR_PTR(err);
1842 if (err || skb_gso_ok(skb, features))
1843 break;
d56f90a7
ACM
1844 __skb_push(skb, (skb->data -
1845 skb_network_header(skb)));
a430a43d 1846 }
576a30eb 1847 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
1848 break;
1849 }
1850 }
1851 rcu_read_unlock();
1852
98e399f8 1853 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 1854
f6a78bfc
HX
1855 return segs;
1856}
f6a78bfc
HX
1857EXPORT_SYMBOL(skb_gso_segment);
1858
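/*
 * Caller sketch (illustrative, not from this file): users of
 * skb_gso_segment() walk the returned singly linked list via skb->next
 * and must handle both the error pointer and the NULL
 * (header-verification-only) returns described above. "foo_xmit_gso"
 * is a hypothetical name.
 */
static int foo_xmit_gso(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *segs = skb_gso_segment(skb, dev->features);

	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return 0;		/* header verification only */

	while (segs) {
		struct sk_buff *next = segs->next;

		segs->next = NULL;
		/* ... hand each segment to the device here ... */
		segs = next;
	}
	kfree_skb(skb);			/* original skb no longer needed */
	return 0;
}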
fb286bb2
HX
1859/* Take action when hardware reception checksum errors are detected. */
1860#ifdef CONFIG_BUG
1861void netdev_rx_csum_fault(struct net_device *dev)
1862{
1863 if (net_ratelimit()) {
4ec93edb 1864 printk(KERN_ERR "%s: hw csum failure.\n",
246a4212 1865 dev ? dev->name : "<unknown>");
fb286bb2
HX
1866 dump_stack();
1867 }
1868}
1869EXPORT_SYMBOL(netdev_rx_csum_fault);
1870#endif
1871
1da177e4
LT
1872/* Actually, we should eliminate this check as soon as we know that:
1873 * 1. IOMMU is present and allows mapping all of memory.
1874 * 2. No high memory really exists on this machine.
1875 */
1876
9092c658 1877static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 1878{
3d3a8533 1879#ifdef CONFIG_HIGHMEM
1da177e4 1880 int i;
5acbbd42
FT
1881 if (!(dev->features & NETIF_F_HIGHDMA)) {
1882 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1883 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1884 return 1;
1885 }
1da177e4 1886
5acbbd42
FT
1887 if (PCI_DMA_BUS_IS_PHYS) {
1888 struct device *pdev = dev->dev.parent;
1da177e4 1889
9092c658
ED
1890 if (!pdev)
1891 return 0;
5acbbd42
FT
1892 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1893 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1894 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1895 return 1;
1896 }
1897 }
3d3a8533 1898#endif
1da177e4
LT
1899 return 0;
1900}
1da177e4 1901
f6a78bfc
HX
1902struct dev_gso_cb {
1903 void (*destructor)(struct sk_buff *skb);
1904};
1905
1906#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1907
1908static void dev_gso_skb_destructor(struct sk_buff *skb)
1909{
1910 struct dev_gso_cb *cb;
1911
1912 do {
1913 struct sk_buff *nskb = skb->next;
1914
1915 skb->next = nskb->next;
1916 nskb->next = NULL;
1917 kfree_skb(nskb);
1918 } while (skb->next);
1919
1920 cb = DEV_GSO_CB(skb);
1921 if (cb->destructor)
1922 cb->destructor(skb);
1923}
1924
1925/**
1926 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1927 * @skb: buffer to segment
1928 *
1929 * This function segments the given skb and stores the list of segments
1930 * in skb->next.
1931 */
1932static int dev_gso_segment(struct sk_buff *skb)
1933{
1934 struct net_device *dev = skb->dev;
1935 struct sk_buff *segs;
576a30eb
HX
1936 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1937 NETIF_F_SG : 0);
1938
1939 segs = skb_gso_segment(skb, features);
1940
1941 /* Verifying header integrity only. */
1942 if (!segs)
1943 return 0;
f6a78bfc 1944
801678c5 1945 if (IS_ERR(segs))
f6a78bfc
HX
1946 return PTR_ERR(segs);
1947
1948 skb->next = segs;
1949 DEV_GSO_CB(skb)->destructor = skb->destructor;
1950 skb->destructor = dev_gso_skb_destructor;
1951
1952 return 0;
1953}
1954
fc6055a5
ED
1955/*
1956 * Try to orphan skb early, right before transmission by the device.
2244d07b
OH
1957 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1958 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
fc6055a5
ED
1959 */
1960static inline void skb_orphan_try(struct sk_buff *skb)
1961{
87fd308c
ED
1962 struct sock *sk = skb->sk;
1963
2244d07b 1964 if (sk && !skb_shinfo(skb)->tx_flags) {
87fd308c
ED
1965 /* skb_tx_hash() won't be able to get the sk.
1966 * We copy sk_hash into skb->rxhash.
1967 */
1968 if (!skb->rxhash)
1969 skb->rxhash = sk->sk_hash;
fc6055a5 1970 skb_orphan(skb);
87fd308c 1971 }
fc6055a5
ED
1972}
1973
6afff0ca
JF
1974/*
1975 * Returns true if either:
1976 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
1977 * 2. skb is fragmented and the device does not support SG, or if
1978 * at least one of the fragments is in highmem and the device does not
1979 * support DMA from it.
1980 */
1981static inline int skb_needs_linearize(struct sk_buff *skb,
1982 struct net_device *dev)
1983{
7b9c6090
JG
1984 int features = dev->features;
1985
1986 if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
1987 features &= dev->vlan_features;
1988
6afff0ca 1989 return skb_is_nonlinear(skb) &&
7b9c6090
JG
1990 ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
1991 (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
6afff0ca
JF
1992 illegal_highdma(dev, skb))));
1993}
1994
fd2ea0a7
DM
1995int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1996 struct netdev_queue *txq)
f6a78bfc 1997{
00829823 1998 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 1999 int rc = NETDEV_TX_OK;
00829823 2000
f6a78bfc 2001 if (likely(!skb->next)) {
9be9a6b9 2002 if (!list_empty(&ptype_all))
f6a78bfc
HX
2003 dev_queue_xmit_nit(skb, dev);
2004
93f154b5
ED
2005 /*
2006 * If the device doesn't need skb->dst, release it right now while
2007 * it's hot in this CPU's cache.
2008 */
adf30907
ED
2009 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2010 skb_dst_drop(skb);
2011
fc6055a5 2012 skb_orphan_try(skb);
9ccb8975 2013
7b9c6090
JG
2014 if (vlan_tx_tag_present(skb) &&
2015 !(dev->features & NETIF_F_HW_VLAN_TX)) {
2016 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2017 if (unlikely(!skb))
2018 goto out;
2019
2020 skb->vlan_tci = 0;
2021 }
2022
9ccb8975
DM
2023 if (netif_needs_gso(dev, skb)) {
2024 if (unlikely(dev_gso_segment(skb)))
2025 goto out_kfree_skb;
2026 if (skb->next)
2027 goto gso;
6afff0ca
JF
2028 } else {
2029 if (skb_needs_linearize(skb, dev) &&
2030 __skb_linearize(skb))
2031 goto out_kfree_skb;
2032
2033 /* If packet is not checksummed and device does not
2034 * support checksumming for this protocol, complete
2035 * checksumming here.
2036 */
2037 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2038 skb_set_transport_header(skb, skb->csum_start -
2039 skb_headroom(skb));
2040 if (!dev_can_checksum(dev, skb) &&
2041 skb_checksum_help(skb))
2042 goto out_kfree_skb;
2043 }
9ccb8975
DM
2044 }
2045
ac45f602 2046 rc = ops->ndo_start_xmit(skb, dev);
cf66ba58 2047 trace_net_dev_xmit(skb, rc);
ec634fe3 2048 if (rc == NETDEV_TX_OK)
08baf561 2049 txq_trans_update(txq);
ac45f602 2050 return rc;
f6a78bfc
HX
2051 }
2052
576a30eb 2053gso:
f6a78bfc
HX
2054 do {
2055 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2056
2057 skb->next = nskb->next;
2058 nskb->next = NULL;
068a2de5
KK
2059
2060 /*
2061 * If the device doesn't need nskb->dst, release it right now while
2062 * it's hot in this CPU's cache.
2063 */
2064 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2065 skb_dst_drop(nskb);
2066
00829823 2067 rc = ops->ndo_start_xmit(nskb, dev);
cf66ba58 2068 trace_net_dev_xmit(nskb, rc);
ec634fe3 2069 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2070 if (rc & ~NETDEV_TX_MASK)
2071 goto out_kfree_gso_skb;
f54d9e8d 2072 nskb->next = skb->next;
f6a78bfc
HX
2073 skb->next = nskb;
2074 return rc;
2075 }
08baf561 2076 txq_trans_update(txq);
fd2ea0a7 2077 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
f54d9e8d 2078 return NETDEV_TX_BUSY;
f6a78bfc 2079 } while (skb->next);
4ec93edb 2080
572a9d7b
PM
2081out_kfree_gso_skb:
2082 if (likely(skb->next == NULL))
2083 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
2084out_kfree_skb:
2085 kfree_skb(skb);
7b9c6090 2086out:
572a9d7b 2087 return rc;
f6a78bfc
HX
2088}
2089
0a9627f2 2090static u32 hashrnd __read_mostly;
b6b2fed1 2091
9247744e 2092u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
8f0f2223 2093{
7019298a 2094 u32 hash;
b6b2fed1 2095
513de11b
DM
2096 if (skb_rx_queue_recorded(skb)) {
2097 hash = skb_get_rx_queue(skb);
d1b19dff 2098 while (unlikely(hash >= dev->real_num_tx_queues))
513de11b
DM
2099 hash -= dev->real_num_tx_queues;
2100 return hash;
2101 }
ec581f6a
ED
2102
2103 if (skb->sk && skb->sk->sk_hash)
7019298a 2104 hash = skb->sk->sk_hash;
ec581f6a 2105 else
87fd308c 2106 hash = (__force u16) skb->protocol ^ skb->rxhash;
0a9627f2 2107 hash = jhash_1word(hash, hashrnd);
b6b2fed1
DM
2108
2109 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
8f0f2223 2110}
9247744e 2111EXPORT_SYMBOL(skb_tx_hash);
8f0f2223 2112
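/*
 * Worked example of the multiply-shift above (illustrative): with
 * real_num_tx_queues = 4 and hash = 0xc0000000, the result is
 * ((u64)0xc0000000 * 4) >> 32 = 3, i.e. the top quarter of the 32-bit
 * hash space maps to the last queue; the mapping is uniform and avoids
 * a modulo on the fast path.
 */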
ed04642f
ED
2113static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2114{
2115 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2116 if (net_ratelimit()) {
7a161ea9
ED
2117 pr_warning("%s selects TX queue %d, but "
2118 "real number of TX queues is %d\n",
2119 dev->name, queue_index, dev->real_num_tx_queues);
ed04642f
ED
2120 }
2121 return 0;
2122 }
2123 return queue_index;
2124}
2125
e8a0464c
DM
2126static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2127 struct sk_buff *skb)
2128{
b0f77d0e 2129 int queue_index;
deabc772 2130 const struct net_device_ops *ops = dev->netdev_ops;
a4ee3ce3 2131
deabc772
HS
2132 if (ops->ndo_select_queue) {
2133 queue_index = ops->ndo_select_queue(dev, skb);
2134 queue_index = dev_cap_txqueue(dev, queue_index);
2135 } else {
2136 struct sock *sk = skb->sk;
2137 queue_index = sk_tx_queue_get(sk);
2138 if (queue_index < 0) {
a4ee3ce3 2139
a4ee3ce3
KK
2140 queue_index = 0;
2141 if (dev->real_num_tx_queues > 1)
2142 queue_index = skb_tx_hash(dev, skb);
fd2ea0a7 2143
8728c544 2144 if (sk) {
87eb3670 2145 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
8728c544
ED
2146
2147 if (dst && skb_dst(skb) == dst)
2148 sk_tx_queue_set(sk, queue_index);
2149 }
a4ee3ce3
KK
2150 }
2151 }
eae792b7 2152
fd2ea0a7
DM
2153 skb_set_queue_mapping(skb, queue_index);
2154 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2155}
2156
bbd8a0d3
KK
2157static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2158 struct net_device *dev,
2159 struct netdev_queue *txq)
2160{
2161 spinlock_t *root_lock = qdisc_lock(q);
79640a4c 2162 bool contended = qdisc_is_running(q);
bbd8a0d3
KK
2163 int rc;
2164
79640a4c
ED
2165 /*
2166 * Heuristic to force contended enqueues to serialize on a
2167 * separate lock before trying to get qdisc main lock.
2168 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2169 * and dequeue packets faster.
2170 */
2171 if (unlikely(contended))
2172 spin_lock(&q->busylock);
2173
bbd8a0d3
KK
2174 spin_lock(root_lock);
2175 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2176 kfree_skb(skb);
2177 rc = NET_XMIT_DROP;
2178 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2179 qdisc_run_begin(q)) {
bbd8a0d3
KK
2180 /*
2181 * This is a work-conserving queue; there are no old skbs
2182 * waiting to be sent out; and the qdisc is not running -
2183 * xmit the skb directly.
2184 */
7fee226a
ED
2185 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2186 skb_dst_force(skb);
bbd8a0d3 2187 __qdisc_update_bstats(q, skb->len);
79640a4c
ED
2188 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2189 if (unlikely(contended)) {
2190 spin_unlock(&q->busylock);
2191 contended = false;
2192 }
bbd8a0d3 2193 __qdisc_run(q);
79640a4c 2194 } else
bc135b23 2195 qdisc_run_end(q);
bbd8a0d3
KK
2196
2197 rc = NET_XMIT_SUCCESS;
2198 } else {
7fee226a 2199 skb_dst_force(skb);
bbd8a0d3 2200 rc = qdisc_enqueue_root(skb, q);
79640a4c
ED
2201 if (qdisc_run_begin(q)) {
2202 if (unlikely(contended)) {
2203 spin_unlock(&q->busylock);
2204 contended = false;
2205 }
2206 __qdisc_run(q);
2207 }
bbd8a0d3
KK
2208 }
2209 spin_unlock(root_lock);
79640a4c
ED
2210 if (unlikely(contended))
2211 spin_unlock(&q->busylock);
bbd8a0d3
KK
2212 return rc;
2213}
2214
745e20f1
ED
2215static DEFINE_PER_CPU(int, xmit_recursion);
2216#define RECURSION_LIMIT 3
2217
d29f749e
DJ
2218/**
2219 * dev_queue_xmit - transmit a buffer
2220 * @skb: buffer to transmit
2221 *
2222 * Queue a buffer for transmission to a network device. The caller must
2223 * have set the device and priority and built the buffer before calling
2224 * this function. The function can be called from an interrupt.
2225 *
2226 * A negative errno code is returned on a failure. A success does not
2227 * guarantee the frame will be transmitted as it may be dropped due
2228 * to congestion or traffic shaping.
2229 *
2230 * -----------------------------------------------------------------------------------
2231 * I notice this method can also return errors from the queue disciplines,
2232 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2233 * be positive.
2234 *
2235 * Regardless of the return value, the skb is consumed, so it is currently
2236 * difficult to retry a send to this method. (You can bump the ref count
2237 * before sending to hold a reference for retry if you are careful.)
2238 *
2239 * When calling this method, interrupts MUST be enabled. This is because
2240 * the BH enable code must have IRQs enabled so that it will not deadlock.
2241 * --BLG
2242 */
1da177e4
LT
2243int dev_queue_xmit(struct sk_buff *skb)
2244{
2245 struct net_device *dev = skb->dev;
dc2b4847 2246 struct netdev_queue *txq;
1da177e4
LT
2247 struct Qdisc *q;
2248 int rc = -ENOMEM;
2249
4ec93edb
YH
2250 /* Disable soft irqs for various locks below. Also
2251 * stops preemption for RCU.
1da177e4 2252 */
4ec93edb 2253 rcu_read_lock_bh();
1da177e4 2254
eae792b7 2255 txq = dev_pick_tx(dev, skb);
a898def2 2256 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2257
1da177e4 2258#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2259 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2260#endif
cf66ba58 2261 trace_net_dev_queue(skb);
1da177e4 2262 if (q->enqueue) {
bbd8a0d3 2263 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2264 goto out;
1da177e4
LT
2265 }
2266
2267 /* The device has no queue. Common case for software devices:
2268 loopback, all the sorts of tunnels...
2269
932ff279
HX
2270 Really, it is unlikely that netif_tx_lock protection is necessary
2271 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2272 counters.)
2273 However, it is possible that they rely on the protection
2274 we provide here.
2275
2276 Check this and take the lock. It is not prone to deadlocks.
2277 Either way we take the noqueue qdisc path; it is even simpler 8)
2278 */
2279 if (dev->flags & IFF_UP) {
2280 int cpu = smp_processor_id(); /* ok because BHs are off */
2281
c773e847 2282 if (txq->xmit_lock_owner != cpu) {
1da177e4 2283
745e20f1
ED
2284 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2285 goto recursion_alert;
2286
c773e847 2287 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2288
fd2ea0a7 2289 if (!netif_tx_queue_stopped(txq)) {
745e20f1 2290 __this_cpu_inc(xmit_recursion);
572a9d7b 2291 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2292 __this_cpu_dec(xmit_recursion);
572a9d7b 2293 if (dev_xmit_complete(rc)) {
c773e847 2294 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2295 goto out;
2296 }
2297 }
c773e847 2298 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2299 if (net_ratelimit())
2300 printk(KERN_CRIT "Virtual device %s asks to "
2301 "queue packet!\n", dev->name);
2302 } else {
2303 /* Recursion is detected! It is possible,
745e20f1
ED
2304 * unfortunately
2305 */
2306recursion_alert:
1da177e4
LT
2307 if (net_ratelimit())
2308 printk(KERN_CRIT "Dead loop on virtual device "
2309 "%s, fix it urgently!\n", dev->name);
2310 }
2311 }
2312
2313 rc = -ENETDOWN;
d4828d85 2314 rcu_read_unlock_bh();
1da177e4 2315
1da177e4
LT
2316 kfree_skb(skb);
2317 return rc;
2318out:
d4828d85 2319 rcu_read_unlock_bh();
1da177e4
LT
2320 return rc;
2321}
d1b19dff 2322EXPORT_SYMBOL(dev_queue_xmit);
1da177e4
LT
2323
2324
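/*
 * Minimal caller sketch (hypothetical module code, not part of this
 * file): the full frame must be built and skb->dev set before calling
 * dev_queue_xmit(), IRQs must be enabled, and the skb is consumed even
 * on error. "foo_send" and its arguments are illustrative.
 */
static int foo_send(struct net_device *dev, const void *frame,
		    unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);
	skb->dev = dev;
	return dev_queue_xmit(skb);	/* consumes skb, even on error */
}
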
2325/*=======================================================================
2326 Receiver routines
2327 =======================================================================*/
2328
6b2bedc3 2329int netdev_max_backlog __read_mostly = 1000;
3b098e2d 2330int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2331int netdev_budget __read_mostly = 300;
2332int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2333
eecfd7c4
ED
2334/* Called with irq disabled */
2335static inline void ____napi_schedule(struct softnet_data *sd,
2336 struct napi_struct *napi)
2337{
2338 list_add_tail(&napi->poll_list, &sd->poll_list);
2339 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2340}
2341
0a9627f2 2342/*
bfb564e7
KK
2343 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2344 * and src/dst port numbers. Returns a non-zero hash number on success
2345 * and 0 on failure.
0a9627f2 2346 */
bfb564e7 2347__u32 __skb_get_rxhash(struct sk_buff *skb)
0a9627f2 2348{
12fcdefb 2349 int nhoff, hash = 0, poff;
0a9627f2
TH
2350 struct ipv6hdr *ip6;
2351 struct iphdr *ip;
0a9627f2 2352 u8 ip_proto;
8c52d509
CG
2353 u32 addr1, addr2, ihl;
2354 union {
2355 u32 v32;
2356 u16 v16[2];
2357 } ports;
0a9627f2 2358
bfb564e7 2359 nhoff = skb_network_offset(skb);
0a9627f2
TH
2360
2361 switch (skb->protocol) {
2362 case __constant_htons(ETH_P_IP):
bfb564e7 2363 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
0a9627f2
TH
2364 goto done;
2365
1003489e 2366 ip = (struct iphdr *) (skb->data + nhoff);
dbe5775b
CG
2367 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2368 ip_proto = 0;
2369 else
2370 ip_proto = ip->protocol;
b249dcb8
ED
2371 addr1 = (__force u32) ip->saddr;
2372 addr2 = (__force u32) ip->daddr;
0a9627f2
TH
2373 ihl = ip->ihl;
2374 break;
2375 case __constant_htons(ETH_P_IPV6):
bfb564e7 2376 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
0a9627f2
TH
2377 goto done;
2378
1003489e 2379 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
0a9627f2 2380 ip_proto = ip6->nexthdr;
b249dcb8
ED
2381 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2382 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
0a9627f2
TH
2383 ihl = (40 >> 2);
2384 break;
2385 default:
2386 goto done;
2387 }
bfb564e7 2388
12fcdefb
CG
2389 ports.v32 = 0;
2390 poff = proto_ports_offset(ip_proto);
2391 if (poff >= 0) {
2392 nhoff += ihl * 4 + poff;
2393 if (pskb_may_pull(skb, nhoff + 4)) {
2394 ports.v32 = * (__force u32 *) (skb->data + nhoff);
8c52d509
CG
2395 if (ports.v16[1] < ports.v16[0])
2396 swap(ports.v16[0], ports.v16[1]);
b249dcb8 2397 }
0a9627f2
TH
2398 }
2399
b249dcb8
ED
2400 /* get a consistent hash (same value on both flow directions) */
2401 if (addr2 < addr1)
2402 swap(addr1, addr2);
0a9627f2 2403
bfb564e7
KK
2404 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2405 if (!hash)
2406 hash = 1;
2407
2408done:
2409 return hash;
2410}
2411EXPORT_SYMBOL(__skb_get_rxhash);
2412
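/*
 * Note on the two swaps above (illustrative): because addr1/addr2 and
 * the two ports are put in canonical order before hashing, both
 * directions of a flow (a:p1 -> b:p2 and b:p2 -> a:p1) yield the same
 * rxhash, so RPS steers the whole conversation to a single CPU.
 */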
2413#ifdef CONFIG_RPS
2414
2415/* One global table that all flow-based protocols share. */
2416struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2417EXPORT_SYMBOL(rps_sock_flow_table);
2418
2419/*
2420 * get_rps_cpu is called from netif_receive_skb and returns the target
2421 * CPU from the RPS map of the receiving queue for a given skb.
2422 * rcu_read_lock must be held on entry.
2423 */
2424static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2425 struct rps_dev_flow **rflowp)
2426{
2427 struct netdev_rx_queue *rxqueue;
6febfca9 2428 struct rps_map *map = NULL;
bfb564e7
KK
2429 struct rps_dev_flow_table *flow_table;
2430 struct rps_sock_flow_table *sock_flow_table;
2431 int cpu = -1;
2432 u16 tcpu;
2433
2434 if (skb_rx_queue_recorded(skb)) {
2435 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2436 if (unlikely(index >= dev->real_num_rx_queues)) {
2437 WARN_ONCE(dev->real_num_rx_queues > 1,
2438 "%s received packet on queue %u, but number "
2439 "of RX queues is %u\n",
2440 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2441 goto done;
2442 }
2443 rxqueue = dev->_rx + index;
2444 } else
2445 rxqueue = dev->_rx;
2446
6febfca9
CG
2447 if (rxqueue->rps_map) {
2448 map = rcu_dereference(rxqueue->rps_map);
2449 if (map && map->len == 1) {
2450 tcpu = map->cpus[0];
2451 if (cpu_online(tcpu))
2452 cpu = tcpu;
2453 goto done;
2454 }
2455 } else if (!rxqueue->rps_flow_table) {
bfb564e7 2456 goto done;
6febfca9 2457 }
bfb564e7 2458
2d47b459 2459 skb_reset_network_header(skb);
bfb564e7
KK
2460 if (!skb_get_rxhash(skb))
2461 goto done;
2462
fec5e652
TH
2463 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2464 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2465 if (flow_table && sock_flow_table) {
2466 u16 next_cpu;
2467 struct rps_dev_flow *rflow;
2468
2469 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2470 tcpu = rflow->cpu;
2471
2472 next_cpu = sock_flow_table->ents[skb->rxhash &
2473 sock_flow_table->mask];
2474
2475 /*
2476 * If the desired CPU (where last recvmsg was done) is
2477 * different from current CPU (one in the rx-queue flow
2478 * table entry), switch if one of the following holds:
2479 * - Current CPU is unset (equal to RPS_NO_CPU).
2480 * - Current CPU is offline.
2481 * - The current CPU's queue tail has advanced beyond the
2482 * last packet that was enqueued using this table entry.
2483 * This guarantees that all previous packets for the flow
2484 * have been dequeued, thus preserving in order delivery.
2485 */
2486 if (unlikely(tcpu != next_cpu) &&
2487 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2488 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2489 rflow->last_qtail)) >= 0)) {
2490 tcpu = rflow->cpu = next_cpu;
2491 if (tcpu != RPS_NO_CPU)
2492 rflow->last_qtail = per_cpu(softnet_data,
2493 tcpu).input_queue_head;
2494 }
2495 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2496 *rflowp = rflow;
2497 cpu = tcpu;
2498 goto done;
2499 }
2500 }
2501
0a9627f2 2502 if (map) {
fec5e652 2503 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
2504
2505 if (cpu_online(tcpu)) {
2506 cpu = tcpu;
2507 goto done;
2508 }
2509 }
2510
2511done:
0a9627f2
TH
2512 return cpu;
2513}
2514
0a9627f2 2515/* Called from hardirq (IPI) context */
e36fa2f7 2516static void rps_trigger_softirq(void *data)
0a9627f2 2517{
e36fa2f7
ED
2518 struct softnet_data *sd = data;
2519
eecfd7c4 2520 ____napi_schedule(sd, &sd->backlog);
dee42870 2521 sd->received_rps++;
0a9627f2 2522}
e36fa2f7 2523
fec5e652 2524#endif /* CONFIG_RPS */
0a9627f2 2525
e36fa2f7
ED
2526/*
2527 * Check if this softnet_data structure belongs to another cpu:
2528 * if yes, queue it to our IPI list and return 1,
2529 * if no, return 0.
2530 */
2531static int rps_ipi_queued(struct softnet_data *sd)
2532{
2533#ifdef CONFIG_RPS
2534 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2535
2536 if (sd != mysd) {
2537 sd->rps_ipi_next = mysd->rps_ipi_list;
2538 mysd->rps_ipi_list = sd;
2539
2540 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2541 return 1;
2542 }
2543#endif /* CONFIG_RPS */
2544 return 0;
2545}
2546
0a9627f2
TH
2547/*
2548 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2549 * queue (may be a remote CPU queue).
2550 */
fec5e652
TH
2551static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2552 unsigned int *qtail)
0a9627f2 2553{
e36fa2f7 2554 struct softnet_data *sd;
0a9627f2
TH
2555 unsigned long flags;
2556
e36fa2f7 2557 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
2558
2559 local_irq_save(flags);
0a9627f2 2560
e36fa2f7 2561 rps_lock(sd);
6e7676c1
CG
2562 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2563 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 2564enqueue:
e36fa2f7 2565 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 2566 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 2567 rps_unlock(sd);
152102c7 2568 local_irq_restore(flags);
0a9627f2
TH
2569 return NET_RX_SUCCESS;
2570 }
2571
ebda37c2
ED
2572 /* Schedule NAPI for backlog device
2573 * We can use a non-atomic operation since we own the queue lock
2574 */
2575 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 2576 if (!rps_ipi_queued(sd))
eecfd7c4 2577 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
2578 }
2579 goto enqueue;
2580 }
2581
dee42870 2582 sd->dropped++;
e36fa2f7 2583 rps_unlock(sd);
0a9627f2 2584
0a9627f2
TH
2585 local_irq_restore(flags);
2586
caf586e5 2587 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
2588 kfree_skb(skb);
2589 return NET_RX_DROP;
2590}
1da177e4 2591
1da177e4
LT
2592/**
2593 * netif_rx - post buffer to the network code
2594 * @skb: buffer to post
2595 *
2596 * This function receives a packet from a device driver and queues it for
2597 * the upper (protocol) levels to process. It always succeeds. The buffer
2598 * may be dropped during processing for congestion control or by the
2599 * protocol layers.
2600 *
2601 * return values:
2602 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2603 * NET_RX_DROP (packet was dropped)
2604 *
2605 */
2606
2607int netif_rx(struct sk_buff *skb)
2608{
b0e28f1e 2609 int ret;
1da177e4
LT
2610
2611 /* if netpoll wants it, pretend we never saw it */
2612 if (netpoll_rx(skb))
2613 return NET_RX_DROP;
2614
3b098e2d
ED
2615 if (netdev_tstamp_prequeue)
2616 net_timestamp_check(skb);
1da177e4 2617
cf66ba58 2618 trace_netif_rx(skb);
df334545 2619#ifdef CONFIG_RPS
b0e28f1e 2620 {
fec5e652 2621 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
2622 int cpu;
2623
cece1945 2624 preempt_disable();
b0e28f1e 2625 rcu_read_lock();
fec5e652
TH
2626
2627 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
2628 if (cpu < 0)
2629 cpu = smp_processor_id();
fec5e652
TH
2630
2631 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2632
b0e28f1e 2633 rcu_read_unlock();
cece1945 2634 preempt_enable();
b0e28f1e 2635 }
1e94d72f 2636#else
fec5e652
TH
2637 {
2638 unsigned int qtail;
2639 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2640 put_cpu();
2641 }
1e94d72f 2642#endif
b0e28f1e 2643 return ret;
1da177e4 2644}
d1b19dff 2645EXPORT_SYMBOL(netif_rx);
1da177e4
LT
2646
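/*
 * Usage sketch (hypothetical non-NAPI driver, not part of this file):
 * frames are handed to netif_rx() from the interrupt handler after the
 * protocol field is set. "foo_rx_frame" and its callers are
 * illustrative names only.
 */
static void foo_rx_frame(struct net_device *dev, const void *data,
			 unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}
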
2647int netif_rx_ni(struct sk_buff *skb)
2648{
2649 int err;
2650
2651 preempt_disable();
2652 err = netif_rx(skb);
2653 if (local_softirq_pending())
2654 do_softirq();
2655 preempt_enable();
2656
2657 return err;
2658}
1da177e4
LT
2659EXPORT_SYMBOL(netif_rx_ni);
2660
1da177e4
LT
2661static void net_tx_action(struct softirq_action *h)
2662{
2663 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2664
2665 if (sd->completion_queue) {
2666 struct sk_buff *clist;
2667
2668 local_irq_disable();
2669 clist = sd->completion_queue;
2670 sd->completion_queue = NULL;
2671 local_irq_enable();
2672
2673 while (clist) {
2674 struct sk_buff *skb = clist;
2675 clist = clist->next;
2676
547b792c 2677 WARN_ON(atomic_read(&skb->users));
07dc22e7 2678 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
2679 __kfree_skb(skb);
2680 }
2681 }
2682
2683 if (sd->output_queue) {
37437bb2 2684 struct Qdisc *head;
1da177e4
LT
2685
2686 local_irq_disable();
2687 head = sd->output_queue;
2688 sd->output_queue = NULL;
a9cbd588 2689 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
2690 local_irq_enable();
2691
2692 while (head) {
37437bb2
DM
2693 struct Qdisc *q = head;
2694 spinlock_t *root_lock;
2695
1da177e4
LT
2696 head = head->next_sched;
2697
5fb66229 2698 root_lock = qdisc_lock(q);
37437bb2 2699 if (spin_trylock(root_lock)) {
def82a1d
JP
2700 smp_mb__before_clear_bit();
2701 clear_bit(__QDISC_STATE_SCHED,
2702 &q->state);
37437bb2
DM
2703 qdisc_run(q);
2704 spin_unlock(root_lock);
1da177e4 2705 } else {
195648bb 2706 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 2707 &q->state)) {
195648bb 2708 __netif_reschedule(q);
e8a83e10
JP
2709 } else {
2710 smp_mb__before_clear_bit();
2711 clear_bit(__QDISC_STATE_SCHED,
2712 &q->state);
2713 }
1da177e4
LT
2714 }
2715 }
2716 }
2717}
2718
6f05f629
SH
2719static inline int deliver_skb(struct sk_buff *skb,
2720 struct packet_type *pt_prev,
2721 struct net_device *orig_dev)
1da177e4
LT
2722{
2723 atomic_inc(&skb->users);
f2ccd8fa 2724 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2725}
2726
ab95bfe0
JP
2727#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2728 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
2729/* This hook is defined here for ATM LANE */
2730int (*br_fdb_test_addr_hook)(struct net_device *dev,
2731 unsigned char *addr) __read_mostly;
4fb019a0 2732EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 2733#endif
1da177e4 2734
1da177e4
LT
2735#ifdef CONFIG_NET_CLS_ACT
2736/* TODO: Maybe we should just force sch_ingress to be compiled in
2737 * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
2738 * instructions (a compare and 2 extra stores) if we don't have it on
2739 * but do have CONFIG_NET_CLS_ACT.
4ec93edb 2740 * NOTE: This doesn't stop any functionality; if you don't have
1da177e4
LT
2741 * the ingress scheduler, you just can't add policies on ingress.
2742 *
2743 */
24824a09 2744static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 2745{
1da177e4 2746 struct net_device *dev = skb->dev;
f697c3e8 2747 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
2748 int result = TC_ACT_OK;
2749 struct Qdisc *q;
4ec93edb 2750
de384830
SH
2751 if (unlikely(MAX_RED_LOOP < ttl++)) {
2752 if (net_ratelimit())
2753 pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2754 skb->skb_iif, dev->ifindex);
f697c3e8
HX
2755 return TC_ACT_SHOT;
2756 }
1da177e4 2757
f697c3e8
HX
2758 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2759 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 2760
83874000 2761 q = rxq->qdisc;
8d50b53d 2762 if (q != &noop_qdisc) {
83874000 2763 spin_lock(qdisc_lock(q));
a9312ae8
DM
2764 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2765 result = qdisc_enqueue_root(skb, q);
83874000
DM
2766 spin_unlock(qdisc_lock(q));
2767 }
f697c3e8
HX
2768
2769 return result;
2770}
86e65da9 2771
f697c3e8
HX
2772static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2773 struct packet_type **pt_prev,
2774 int *ret, struct net_device *orig_dev)
2775{
24824a09
ED
2776 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2777
2778 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 2779 goto out;
1da177e4 2780
f697c3e8
HX
2781 if (*pt_prev) {
2782 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2783 *pt_prev = NULL;
1da177e4
LT
2784 }
2785
24824a09 2786 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
2787 case TC_ACT_SHOT:
2788 case TC_ACT_STOLEN:
2789 kfree_skb(skb);
2790 return NULL;
2791 }
2792
2793out:
2794 skb->tc_verd = 0;
2795 return skb;
1da177e4
LT
2796}
2797#endif
2798
ab95bfe0
JP
2799/**
2800 * netdev_rx_handler_register - register receive handler
2801 * @dev: device to register a handler for
2802 * @rx_handler: receive handler to register
93e2c32b 2803 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
2804 *
2805 * Register a receive handler for a device. This handler will then be
2806 * called from __netif_receive_skb. A negative errno code is returned
2807 * on a failure.
2808 *
2809 * The caller must hold the rtnl_mutex.
2810 */
2811int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
2812 rx_handler_func_t *rx_handler,
2813 void *rx_handler_data)
ab95bfe0
JP
2814{
2815 ASSERT_RTNL();
2816
2817 if (dev->rx_handler)
2818 return -EBUSY;
2819
93e2c32b 2820 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
2821 rcu_assign_pointer(dev->rx_handler, rx_handler);
2822
2823 return 0;
2824}
2825EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2826
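/*
 * Registration sketch (hypothetical caller, not part of this file):
 * the handler is installed under RTNL, as the bridge and macvlan code
 * do; returning the skb asks __netif_receive_skb to continue normal
 * processing, returning NULL means the handler consumed it. The
 * "foo_*" names are illustrative.
 */
static struct sk_buff *foo_handle_frame(struct sk_buff *skb)
{
	/* inspect or steal the skb here */
	return skb;
}

static int foo_attach(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, foo_handle_frame, priv);
	rtnl_unlock();
	return err;
}
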
2827/**
2828 * netdev_rx_handler_unregister - unregister receive handler
2829 * @dev: device to unregister a handler from
2830 *
2831 * Unregister a receive handler from a device.
2832 *
2833 * The caller must hold the rtnl_mutex.
2834 */
2835void netdev_rx_handler_unregister(struct net_device *dev)
2836{
2837
2838 ASSERT_RTNL();
2839 rcu_assign_pointer(dev->rx_handler, NULL);
93e2c32b 2840 rcu_assign_pointer(dev->rx_handler_data, NULL);
ab95bfe0
JP
2841}
2842EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2843
acbbc071
ED
2844static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2845 struct net_device *master)
2846{
2847 if (skb->pkt_type == PACKET_HOST) {
2848 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2849
2850 memcpy(dest, master->dev_addr, ETH_ALEN);
2851 }
2852}
2853
2854/* On bonding slaves other than the currently active slave, suppress
2855 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2856 * ARP on active-backup slaves with arp_validate enabled.
2857 */
2858int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2859{
2860 struct net_device *dev = skb->dev;
2861
2862 if (master->priv_flags & IFF_MASTER_ARPMON)
2863 dev->last_rx = jiffies;
2864
f350a0a8
JP
2865 if ((master->priv_flags & IFF_MASTER_ALB) &&
2866 (master->priv_flags & IFF_BRIDGE_PORT)) {
acbbc071
ED
2867 /* Do address unmangle. The local destination address
2868 * will be always the one master has. Provides the right
2869 * functionality in a bridge.
2870 */
2871 skb_bond_set_mac_by_master(skb, master);
2872 }
2873
2874 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2875 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2876 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2877 return 0;
2878
2879 if (master->priv_flags & IFF_MASTER_ALB) {
2880 if (skb->pkt_type != PACKET_BROADCAST &&
2881 skb->pkt_type != PACKET_MULTICAST)
2882 return 0;
2883 }
2884 if (master->priv_flags & IFF_MASTER_8023AD &&
2885 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2886 return 0;
2887
2888 return 1;
2889 }
2890 return 0;
2891}
2892EXPORT_SYMBOL(__skb_bond_should_drop);
2893
10f744d2 2894static int __netif_receive_skb(struct sk_buff *skb)
1da177e4
LT
2895{
2896 struct packet_type *ptype, *pt_prev;
ab95bfe0 2897 rx_handler_func_t *rx_handler;
f2ccd8fa 2898 struct net_device *orig_dev;
0641e4fb 2899 struct net_device *master;
0d7a3681 2900 struct net_device *null_or_orig;
2df4a0fa 2901 struct net_device *orig_or_bond;
1da177e4 2902 int ret = NET_RX_DROP;
252e3346 2903 __be16 type;
1da177e4 2904
3b098e2d
ED
2905 if (!netdev_tstamp_prequeue)
2906 net_timestamp_check(skb);
81bbb3d4 2907
cf66ba58 2908 trace_netif_receive_skb(skb);
9b22ea56 2909
1da177e4 2910 /* if we've gotten here through NAPI, check netpoll */
bea3348e 2911 if (netpoll_receive_skb(skb))
1da177e4
LT
2912 return NET_RX_DROP;
2913
8964be4a
ED
2914 if (!skb->skb_iif)
2915 skb->skb_iif = skb->dev->ifindex;
86e65da9 2916
597a264b
JF
2917 /*
2918 * bonding note: skbs received on inactive slaves should only
2919 * be delivered to pkt handlers that are exact matches. Also
2920 * the deliver_no_wcard flag will be set. If packet handlers
2921 * are sensitive to duplicate packets these skbs will need to
3701e513 2922 * be dropped at the handler.
597a264b 2923 */
0d7a3681 2924 null_or_orig = NULL;
cc9bd5ce 2925 orig_dev = skb->dev;
0641e4fb 2926 master = ACCESS_ONCE(orig_dev->master);
597a264b
JF
2927 if (skb->deliver_no_wcard)
2928 null_or_orig = orig_dev;
2929 else if (master) {
2930 if (skb_bond_should_drop(skb, master)) {
2931 skb->deliver_no_wcard = 1;
0d7a3681 2932 null_or_orig = orig_dev; /* deliver only exact match */
597a264b 2933 } else
0641e4fb 2934 skb->dev = master;
cc9bd5ce 2935 }
8f903c70 2936
27f39c73 2937 __this_cpu_inc(softnet_data.processed);
c1d2bbe1 2938 skb_reset_network_header(skb);
badff6d0 2939 skb_reset_transport_header(skb);
b0e380b1 2940 skb->mac_len = skb->network_header - skb->mac_header;
1da177e4
LT
2941
2942 pt_prev = NULL;
2943
2944 rcu_read_lock();
2945
2946#ifdef CONFIG_NET_CLS_ACT
2947 if (skb->tc_verd & TC_NCLS) {
2948 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2949 goto ncls;
2950 }
2951#endif
2952
2953 list_for_each_entry_rcu(ptype, &ptype_all, list) {
f982307f
JE
2954 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2955 ptype->dev == orig_dev) {
4ec93edb 2956 if (pt_prev)
f2ccd8fa 2957 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2958 pt_prev = ptype;
2959 }
2960 }
2961
2962#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
2963 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2964 if (!skb)
1da177e4 2965 goto out;
1da177e4
LT
2966ncls:
2967#endif
2968
ab95bfe0
JP
2969 /* Handle special case of bridge or macvlan */
2970 rx_handler = rcu_dereference(skb->dev->rx_handler);
2971 if (rx_handler) {
2972 if (pt_prev) {
2973 ret = deliver_skb(skb, pt_prev, orig_dev);
2974 pt_prev = NULL;
2975 }
2976 skb = rx_handler(skb);
2977 if (!skb)
2978 goto out;
2979 }
1da177e4 2980
3701e513
JG
2981 if (vlan_tx_tag_present(skb)) {
2982 if (pt_prev) {
2983 ret = deliver_skb(skb, pt_prev, orig_dev);
2984 pt_prev = NULL;
2985 }
2986 if (vlan_hwaccel_do_receive(&skb)) {
2987 ret = __netif_receive_skb(skb);
2988 goto out;
2989 } else if (unlikely(!skb))
2990 goto out;
2991 }
2992
1f3c8804
AG
2993 /*
2994 * Make sure frames received on VLAN interfaces stacked on
2995 * bonding interfaces still make their way to any base bonding
2996 * device that may have registered for a specific ptype. The
2997 * handler may have to adjust skb->dev and orig_dev.
1f3c8804 2998 */
2df4a0fa 2999 orig_or_bond = orig_dev;
1f3c8804
AG
3000 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3001 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2df4a0fa 3002 orig_or_bond = vlan_dev_real_dev(skb->dev);
1f3c8804
AG
3003 }
3004
1da177e4 3005 type = skb->protocol;
82d8a867
PE
3006 list_for_each_entry_rcu(ptype,
3007 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1f3c8804 3008 if (ptype->type == type && (ptype->dev == null_or_orig ||
ca8d9ea3 3009 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2df4a0fa 3010 ptype->dev == orig_or_bond)) {
4ec93edb 3011 if (pt_prev)
f2ccd8fa 3012 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3013 pt_prev = ptype;
3014 }
3015 }
3016
3017 if (pt_prev) {
f2ccd8fa 3018 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3019 } else {
caf586e5 3020 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3021 kfree_skb(skb);
3022 /* Jamal, now you will not be able to escape explaining
3023 * to me how you were going to use this. :-)
3024 */
3025 ret = NET_RX_DROP;
3026 }
3027
3028out:
3029 rcu_read_unlock();
3030 return ret;
3031}
0a9627f2
TH
3032
3033/**
3034 * netif_receive_skb - process receive buffer from network
3035 * @skb: buffer to process
3036 *
3037 * netif_receive_skb() is the main receive data processing function.
3038 * It always succeeds. The buffer may be dropped during processing
3039 * for congestion control or by the protocol layers.
3040 *
3041 * This function may only be called from softirq context and interrupts
3042 * should be enabled.
3043 *
3044 * Return values (usually ignored):
3045 * NET_RX_SUCCESS: no congestion
3046 * NET_RX_DROP: packet was dropped
3047 */
3048int netif_receive_skb(struct sk_buff *skb)
3049{
3b098e2d
ED
3050 if (netdev_tstamp_prequeue)
3051 net_timestamp_check(skb);
3052
c1f19b51
RC
3053 if (skb_defer_rx_timestamp(skb))
3054 return NET_RX_SUCCESS;
3055
df334545 3056#ifdef CONFIG_RPS
3b098e2d
ED
3057 {
3058 struct rps_dev_flow voidflow, *rflow = &voidflow;
3059 int cpu, ret;
fec5e652 3060
3b098e2d
ED
3061 rcu_read_lock();
3062
3063 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3064
3b098e2d
ED
3065 if (cpu >= 0) {
3066 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3067 rcu_read_unlock();
3068 } else {
3069 rcu_read_unlock();
3070 ret = __netif_receive_skb(skb);
3071 }
0a9627f2 3072
3b098e2d 3073 return ret;
fec5e652 3074 }
1e94d72f
TH
3075#else
3076 return __netif_receive_skb(skb);
3077#endif
0a9627f2 3078}
d1b19dff 3079EXPORT_SYMBOL(netif_receive_skb);
1da177e4 3080
88751275
ED
3081/* Network device is going away, flush any packets still pending
3082 * Called with irqs disabled.
3083 */
152102c7 3084static void flush_backlog(void *arg)
6e583ce5 3085{
152102c7 3086 struct net_device *dev = arg;
e36fa2f7 3087 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3088 struct sk_buff *skb, *tmp;
3089
e36fa2f7 3090 rps_lock(sd);
6e7676c1 3091 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3092 if (skb->dev == dev) {
e36fa2f7 3093 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3094 kfree_skb(skb);
76cc8b13 3095 input_queue_head_incr(sd);
6e583ce5 3096 }
6e7676c1 3097 }
e36fa2f7 3098 rps_unlock(sd);
6e7676c1
CG
3099
3100 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3101 if (skb->dev == dev) {
3102 __skb_unlink(skb, &sd->process_queue);
3103 kfree_skb(skb);
76cc8b13 3104 input_queue_head_incr(sd);
6e7676c1
CG
3105 }
3106 }
6e583ce5
SH
3107}
3108
d565b0a1
HX
3109static int napi_gro_complete(struct sk_buff *skb)
3110{
3111 struct packet_type *ptype;
3112 __be16 type = skb->protocol;
3113 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3114 int err = -ENOENT;
3115
fc59f9a3
HX
3116 if (NAPI_GRO_CB(skb)->count == 1) {
3117 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3118 goto out;
fc59f9a3 3119 }
d565b0a1
HX
3120
3121 rcu_read_lock();
3122 list_for_each_entry_rcu(ptype, head, list) {
3123 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3124 continue;
3125
3126 err = ptype->gro_complete(skb);
3127 break;
3128 }
3129 rcu_read_unlock();
3130
3131 if (err) {
3132 WARN_ON(&ptype->list == head);
3133 kfree_skb(skb);
3134 return NET_RX_SUCCESS;
3135 }
3136
3137out:
d565b0a1
HX
3138 return netif_receive_skb(skb);
3139}
3140
86cac58b 3141inline void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
3142{
3143 struct sk_buff *skb, *next;
3144
3145 for (skb = napi->gro_list; skb; skb = next) {
3146 next = skb->next;
3147 skb->next = NULL;
3148 napi_gro_complete(skb);
3149 }
3150
4ae5544f 3151 napi->gro_count = 0;
d565b0a1
HX
3152 napi->gro_list = NULL;
3153}
86cac58b 3154EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3155
5b252f0c 3156enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3157{
3158 struct sk_buff **pp = NULL;
3159 struct packet_type *ptype;
3160 __be16 type = skb->protocol;
3161 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 3162 int same_flow;
d565b0a1 3163 int mac_len;
5b252f0c 3164 enum gro_result ret;
d565b0a1 3165
ce9e76c8 3166 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3167 goto normal;
3168
21dc3301 3169 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3170 goto normal;
3171
d565b0a1
HX
3172 rcu_read_lock();
3173 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
3174 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3175 continue;
3176
86911732 3177 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
3178 mac_len = skb->network_header - skb->mac_header;
3179 skb->mac_len = mac_len;
3180 NAPI_GRO_CB(skb)->same_flow = 0;
3181 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3182 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3183
d565b0a1
HX
3184 pp = ptype->gro_receive(&napi->gro_list, skb);
3185 break;
3186 }
3187 rcu_read_unlock();
3188
3189 if (&ptype->list == head)
3190 goto normal;
3191
0da2afd5 3192 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3193 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3194
d565b0a1
HX
3195 if (pp) {
3196 struct sk_buff *nskb = *pp;
3197
3198 *pp = nskb->next;
3199 nskb->next = NULL;
3200 napi_gro_complete(nskb);
4ae5544f 3201 napi->gro_count--;
d565b0a1
HX
3202 }
3203
0da2afd5 3204 if (same_flow)
d565b0a1
HX
3205 goto ok;
3206
4ae5544f 3207 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3208 goto normal;
d565b0a1 3209
4ae5544f 3210 napi->gro_count++;
d565b0a1 3211 NAPI_GRO_CB(skb)->count = 1;
86911732 3212 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3213 skb->next = napi->gro_list;
3214 napi->gro_list = skb;
5d0d9be8 3215 ret = GRO_HELD;
d565b0a1 3216
ad0f9904 3217pull:
cb18978c
HX
3218 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3219 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3220
3221 BUG_ON(skb->end - skb->tail < grow);
3222
3223 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3224
3225 skb->tail += grow;
3226 skb->data_len -= grow;
3227
3228 skb_shinfo(skb)->frags[0].page_offset += grow;
3229 skb_shinfo(skb)->frags[0].size -= grow;
3230
3231 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3232 put_page(skb_shinfo(skb)->frags[0].page);
3233 memmove(skb_shinfo(skb)->frags,
3234 skb_shinfo(skb)->frags + 1,
e5093aec 3235 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3236 }
ad0f9904
HX
3237 }
3238
d565b0a1 3239ok:
5d0d9be8 3240 return ret;
d565b0a1
HX
3241
3242normal:
ad0f9904
HX
3243 ret = GRO_NORMAL;
3244 goto pull;
5d38a079 3245}
96e93eab
HX
3246EXPORT_SYMBOL(dev_gro_receive);
3247
40d0802b 3248static inline gro_result_t
5b252f0c 3249__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
3250{
3251 struct sk_buff *p;
3252
3253 for (p = napi->gro_list; p; p = p->next) {
40d0802b
ED
3254 unsigned long diffs;
3255
3256 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3701e513 3257 diffs |= p->vlan_tci ^ skb->vlan_tci;
40d0802b 3258 diffs |= compare_ether_header(skb_mac_header(p),
f64f9e71 3259 skb_gro_mac_header(skb));
40d0802b 3260 NAPI_GRO_CB(p)->same_flow = !diffs;
96e93eab
HX
3261 NAPI_GRO_CB(p)->flush = 0;
3262 }
3263
3264 return dev_gro_receive(napi, skb);
3265}
5d38a079 3266
c7c4b3b6 3267gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3268{
5d0d9be8
HX
3269 switch (ret) {
3270 case GRO_NORMAL:
c7c4b3b6
BH
3271 if (netif_receive_skb(skb))
3272 ret = GRO_DROP;
3273 break;
5d38a079 3274
5d0d9be8 3275 case GRO_DROP:
5d0d9be8 3276 case GRO_MERGED_FREE:
5d38a079
HX
3277 kfree_skb(skb);
3278 break;
5b252f0c
BH
3279
3280 case GRO_HELD:
3281 case GRO_MERGED:
3282 break;
5d38a079
HX
3283 }
3284
c7c4b3b6 3285 return ret;
5d0d9be8
HX
3286}
3287EXPORT_SYMBOL(napi_skb_finish);
3288
78a478d0
HX
3289void skb_gro_reset_offset(struct sk_buff *skb)
3290{
3291 NAPI_GRO_CB(skb)->data_offset = 0;
3292 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3293 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3294
78d3fd0b 3295 if (skb->mac_header == skb->tail &&
7489594c 3296 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
78a478d0
HX
3297 NAPI_GRO_CB(skb)->frag0 =
3298 page_address(skb_shinfo(skb)->frags[0].page) +
3299 skb_shinfo(skb)->frags[0].page_offset;
7489594c
HX
3300 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3301 }
78a478d0
HX
3302}
3303EXPORT_SYMBOL(skb_gro_reset_offset);
3304
c7c4b3b6 3305gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3306{
86911732
HX
3307 skb_gro_reset_offset(skb);
3308
5d0d9be8 3309 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
3310}
3311EXPORT_SYMBOL(napi_gro_receive);
3312
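/*
 * Poll-loop sketch (hypothetical NAPI driver): each received frame is
 * fed to napi_gro_receive() instead of netif_receive_skb() so that
 * consecutive segments of one flow can be merged before the stack sees
 * them. "foo_gro_rx" is an illustrative name.
 */
static void foo_gro_rx(struct napi_struct *napi, struct net_device *dev,
		       struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	napi_gro_receive(napi, skb);	/* may merge, hold or deliver */
}
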
d0c2b0d2 3313static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3314{
96e93eab
HX
3315 __skb_pull(skb, skb_headlen(skb));
3316 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3701e513 3317 skb->vlan_tci = 0;
96e93eab
HX
3318
3319 napi->skb = skb;
3320}
96e93eab 3321
76620aaf 3322struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3323{
5d38a079 3324 struct sk_buff *skb = napi->skb;
5d38a079
HX
3325
3326 if (!skb) {
89d71a66
ED
3327 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3328 if (skb)
3329 napi->skb = skb;
80595d59 3330 }
96e93eab
HX
3331 return skb;
3332}
76620aaf 3333EXPORT_SYMBOL(napi_get_frags);
96e93eab 3334
c7c4b3b6
BH
3335gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3336 gro_result_t ret)
96e93eab 3337{
5d0d9be8
HX
3338 switch (ret) {
3339 case GRO_NORMAL:
86911732 3340 case GRO_HELD:
e76b69cc 3341 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3342
c7c4b3b6
BH
3343 if (ret == GRO_HELD)
3344 skb_gro_pull(skb, -ETH_HLEN);
3345 else if (netif_receive_skb(skb))
3346 ret = GRO_DROP;
86911732 3347 break;
5d38a079 3348
5d0d9be8 3349 case GRO_DROP:
5d0d9be8
HX
3350 case GRO_MERGED_FREE:
3351 napi_reuse_skb(napi, skb);
3352 break;
5b252f0c
BH
3353
3354 case GRO_MERGED:
3355 break;
5d0d9be8 3356 }
5d38a079 3357
c7c4b3b6 3358 return ret;
5d38a079 3359}
5d0d9be8
HX
3360EXPORT_SYMBOL(napi_frags_finish);
3361
76620aaf
HX
3362struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3363{
3364 struct sk_buff *skb = napi->skb;
3365 struct ethhdr *eth;
a5b1cf28
HX
3366 unsigned int hlen;
3367 unsigned int off;
76620aaf
HX
3368
3369 napi->skb = NULL;
3370
3371 skb_reset_mac_header(skb);
3372 skb_gro_reset_offset(skb);
3373
a5b1cf28
HX
3374 off = skb_gro_offset(skb);
3375 hlen = off + sizeof(*eth);
3376 eth = skb_gro_header_fast(skb, off);
3377 if (skb_gro_header_hard(skb, hlen)) {
3378 eth = skb_gro_header_slow(skb, hlen, off);
3379 if (unlikely(!eth)) {
3380 napi_reuse_skb(napi, skb);
3381 skb = NULL;
3382 goto out;
3383 }
76620aaf
HX
3384 }
3385
3386 skb_gro_pull(skb, sizeof(*eth));
3387
3388 /*
3389 * This works because the only protocols we care about don't require
3390 * special handling. We'll fix it up properly at the end.
3391 */
3392 skb->protocol = eth->h_proto;
3393
3394out:
3395 return skb;
3396}
3397EXPORT_SYMBOL(napi_frags_skb);
3398
c7c4b3b6 3399gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3400{
76620aaf 3401 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3402
3403 if (!skb)
c7c4b3b6 3404 return GRO_DROP;
5d0d9be8
HX
3405
3406 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3407}
5d38a079
HX
3408EXPORT_SYMBOL(napi_gro_frags);
3409
e326bed2
ED
3410/*
3411 * net_rps_action sends any pending IPI's for rps.
3412 * Note: called with local irq disabled, but exits with local irq enabled.
3413 */
3414static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3415{
3416#ifdef CONFIG_RPS
3417 struct softnet_data *remsd = sd->rps_ipi_list;
3418
3419 if (remsd) {
3420 sd->rps_ipi_list = NULL;
3421
3422 local_irq_enable();
3423
3424 /* Send pending IPI's to kick RPS processing on remote cpus. */
3425 while (remsd) {
3426 struct softnet_data *next = remsd->rps_ipi_next;
3427
3428 if (cpu_online(remsd->cpu))
3429 __smp_call_function_single(remsd->cpu,
3430 &remsd->csd, 0);
3431 remsd = next;
3432 }
3433 } else
3434#endif
3435 local_irq_enable();
3436}
3437
bea3348e 3438static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3439{
3440 int work = 0;
eecfd7c4 3441 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 3442
e326bed2
ED
3443#ifdef CONFIG_RPS
3444 /* Check if we have pending IPIs; it's better to send them now,
3445 * not to wait for net_rx_action() to end.
3446 */
3447 if (sd->rps_ipi_list) {
3448 local_irq_disable();
3449 net_rps_action_and_irq_enable(sd);
3450 }
3451#endif
bea3348e 3452 napi->weight = weight_p;
6e7676c1
CG
3453 local_irq_disable();
3454 while (work < quota) {
1da177e4 3455 struct sk_buff *skb;
6e7676c1
CG
3456 unsigned int qlen;
3457
3458 while ((skb = __skb_dequeue(&sd->process_queue))) {
3459 local_irq_enable();
3460 __netif_receive_skb(skb);
6e7676c1 3461 local_irq_disable();
76cc8b13
TH
3462 input_queue_head_incr(sd);
3463 if (++work >= quota) {
3464 local_irq_enable();
3465 return work;
3466 }
6e7676c1 3467 }
1da177e4 3468
e36fa2f7 3469 rps_lock(sd);
6e7676c1 3470 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 3471 if (qlen)
6e7676c1
CG
3472 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3473 &sd->process_queue);
76cc8b13 3474
6e7676c1 3475 if (qlen < quota - work) {
eecfd7c4
ED
3476 /*
3477 * Inline a custom version of __napi_complete().
3478 * Only the current cpu owns and manipulates this napi,
3479 * and NAPI_STATE_SCHED is the only possible flag set on backlog;
3480 * we can use a plain write instead of clear_bit(),
3481 * and we don't need an smp_mb() memory barrier.
3482 */
3483 list_del(&napi->poll_list);
3484 napi->state = 0;
3485
6e7676c1 3486 quota = work + qlen;
bea3348e 3487 }
e36fa2f7 3488 rps_unlock(sd);
6e7676c1
CG
3489 }
3490 local_irq_enable();
1da177e4 3491
bea3348e
SH
3492 return work;
3493}
1da177e4 3494
bea3348e
SH
3495/**
3496 * __napi_schedule - schedule for receive
c4ea43c5 3497 * @n: entry to schedule
bea3348e
SH
3498 *
3499 * The entry's receive function will be scheduled to run
3500 */
b5606c2d 3501void __napi_schedule(struct napi_struct *n)
bea3348e
SH
3502{
3503 unsigned long flags;
1da177e4 3504
bea3348e 3505 local_irq_save(flags);
eecfd7c4 3506 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 3507 local_irq_restore(flags);
1da177e4 3508}
bea3348e
SH
3509EXPORT_SYMBOL(__napi_schedule);
3510
d565b0a1
HX
3511void __napi_complete(struct napi_struct *n)
3512{
3513 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3514 BUG_ON(n->gro_list);
3515
3516 list_del(&n->poll_list);
3517 smp_mb__before_clear_bit();
3518 clear_bit(NAPI_STATE_SCHED, &n->state);
3519}
3520EXPORT_SYMBOL(__napi_complete);
3521
3522void napi_complete(struct napi_struct *n)
3523{
3524 unsigned long flags;
3525
3526 /*
3527 * don't let napi dequeue from the cpu poll list
3528 * just in case its running on a different cpu
3529 */
3530 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3531 return;
3532
3533 napi_gro_flush(n);
3534 local_irq_save(flags);
3535 __napi_complete(n);
3536 local_irq_restore(flags);
3537}
3538EXPORT_SYMBOL(napi_complete);
3539
3540void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3541 int (*poll)(struct napi_struct *, int), int weight)
3542{
3543 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 3544 napi->gro_count = 0;
d565b0a1 3545 napi->gro_list = NULL;
5d38a079 3546 napi->skb = NULL;
d565b0a1
HX
3547 napi->poll = poll;
3548 napi->weight = weight;
3549 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 3550 napi->dev = dev;
5d38a079 3551#ifdef CONFIG_NETPOLL
d565b0a1
HX
3552 spin_lock_init(&napi->poll_lock);
3553 napi->poll_owner = -1;
3554#endif
3555 set_bit(NAPI_STATE_SCHED, &napi->state);
3556}
3557EXPORT_SYMBOL(netif_napi_add);
3558
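/*
 * Setup sketch (hypothetical driver): register a poll callback with
 * the conventional weight of 64; napi_complete() is called only when
 * the budget was not exhausted. "foo_poll" and "priv" are illustrative.
 */
static int foo_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	/* ... process up to budget RX packets, incrementing work ... */

	if (work < budget)
		napi_complete(napi);	/* then re-enable device IRQs */
	return work;
}

/* in the probe path: netif_napi_add(netdev, &priv->napi, foo_poll, 64); */
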
3559void netif_napi_del(struct napi_struct *napi)
3560{
3561 struct sk_buff *skb, *next;
3562
d7b06636 3563 list_del_init(&napi->dev_list);
76620aaf 3564 napi_free_frags(napi);
d565b0a1
HX
3565
3566 for (skb = napi->gro_list; skb; skb = next) {
3567 next = skb->next;
3568 skb->next = NULL;
3569 kfree_skb(skb);
3570 }
3571
3572 napi->gro_list = NULL;
4ae5544f 3573 napi->gro_count = 0;
d565b0a1
HX
3574}
3575EXPORT_SYMBOL(netif_napi_del);
3576
1da177e4
LT
3577static void net_rx_action(struct softirq_action *h)
3578{
e326bed2 3579 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 3580 unsigned long time_limit = jiffies + 2;
51b0bded 3581 int budget = netdev_budget;
53fb95d3
MM
3582 void *have;
3583
1da177e4
LT
3584 local_irq_disable();
3585
e326bed2 3586 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
3587 struct napi_struct *n;
3588 int work, weight;
1da177e4 3589
bea3348e 3590 /* If softirq window is exhausted then punt.
24f8b238
SH
 3591 * Allow this to run for 2 jiffies, which allows
3592 * an average latency of 1.5/HZ.
bea3348e 3593 */
24f8b238 3594 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3595 goto softnet_break;
3596
3597 local_irq_enable();
3598
bea3348e
SH
3599 /* Even though interrupts have been re-enabled, this
3600 * access is safe because interrupts can only add new
3601 * entries to the tail of this list, and only ->poll()
3602 * calls can remove this head entry from the list.
3603 */
e326bed2 3604 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 3605
bea3348e
SH
3606 have = netpoll_poll_lock(n);
3607
3608 weight = n->weight;
3609
0a7606c1
DM
3610 /* This NAPI_STATE_SCHED test is for avoiding a race
3611 * with netpoll's poll_napi(). Only the entity which
3612 * obtains the lock and sees NAPI_STATE_SCHED set will
3613 * actually make the ->poll() call. Therefore we avoid
 3614 * accidentally calling ->poll() when NAPI is not scheduled.
3615 */
3616 work = 0;
4ea7e386 3617 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3618 work = n->poll(n, weight);
4ea7e386
NH
3619 trace_napi_poll(n);
3620 }
bea3348e
SH
3621
3622 WARN_ON_ONCE(work > weight);
3623
3624 budget -= work;
3625
3626 local_irq_disable();
3627
3628 /* Drivers must not modify the NAPI state if they
3629 * consume the entire weight. In such cases this code
3630 * still "owns" the NAPI instance and therefore can
3631 * move the instance around on the list at-will.
3632 */
fed17f30 3633 if (unlikely(work == weight)) {
ff780cd8
HX
3634 if (unlikely(napi_disable_pending(n))) {
3635 local_irq_enable();
3636 napi_complete(n);
3637 local_irq_disable();
3638 } else
e326bed2 3639 list_move_tail(&n->poll_list, &sd->poll_list);
fed17f30 3640 }
bea3348e
SH
3641
3642 netpoll_poll_unlock(have);
1da177e4
LT
3643 }
3644out:
e326bed2 3645 net_rps_action_and_irq_enable(sd);
0a9627f2 3646
db217334
CL
3647#ifdef CONFIG_NET_DMA
3648 /*
3649 * There may not be any more sk_buffs coming right now, so push
3650 * any pending DMA copies to hardware
3651 */
2ba05622 3652 dma_issue_pending_all();
db217334 3653#endif
bea3348e 3654
1da177e4
LT
3655 return;
3656
3657softnet_break:
dee42870 3658 sd->time_squeeze++;
1da177e4
LT
3659 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3660 goto out;
3661}
3662
d1b19dff 3663static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3664
3665/**
3666 * register_gifconf - register a SIOCGIF handler
3667 * @family: Address family
3668 * @gifconf: Function handler
3669 *
 3670 * Register protocol-dependent address dumping routines. The handler
3671 * that is passed must not be freed or reused until it has been replaced
3672 * by another handler.
3673 */
d1b19dff 3674int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3675{
3676 if (family >= NPROTO)
3677 return -EINVAL;
3678 gifconf_list[family] = gifconf;
3679 return 0;
3680}
d1b19dff 3681EXPORT_SYMBOL(register_gifconf);
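
/*
 * Illustrative sketch (not part of dev.c): an address family hooks its
 * SIOCGIFCONF dumper at init time, much as IPv4 registers its handler
 * from net/ipv4/devinet.c. my_gifconf() and MY_PF are hypothetical;
 * the handler prototype is gifconf_func_t from <linux/netdevice.h>.
 */
static int my_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* write one ifreq per address of @dev into @buf (or, when @buf is
	 * NULL, just return the space that would be consumed) */
	return 0;
}

static int __init my_af_init(void)
{
	return register_gifconf(MY_PF, my_gifconf);
}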
1da177e4
LT
3682
3683
3684/*
3685 * Map an interface index to its name (SIOCGIFNAME)
3686 */
3687
3688/*
3689 * We need this ioctl for efficient implementation of the
3690 * if_indextoname() function required by the IPv6 API. Without
3691 * it, we would have to search all the interfaces to find a
3692 * match. --pb
3693 */
3694
881d966b 3695static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
3696{
3697 struct net_device *dev;
3698 struct ifreq ifr;
3699
3700 /*
3701 * Fetch the caller's info block.
3702 */
3703
3704 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3705 return -EFAULT;
3706
fb699dfd
ED
3707 rcu_read_lock();
3708 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 3709 if (!dev) {
fb699dfd 3710 rcu_read_unlock();
1da177e4
LT
3711 return -ENODEV;
3712 }
3713
3714 strcpy(ifr.ifr_name, dev->name);
fb699dfd 3715 rcu_read_unlock();
1da177e4
LT
3716
3717 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3718 return -EFAULT;
3719 return 0;
3720}
3721
3722/*
3723 * Perform a SIOCGIFCONF call. This structure will change
3724 * size eventually, and there is nothing I can do about it.
3725 * Thus we will need a 'compatibility mode'.
3726 */
3727
881d966b 3728static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
3729{
3730 struct ifconf ifc;
3731 struct net_device *dev;
3732 char __user *pos;
3733 int len;
3734 int total;
3735 int i;
3736
3737 /*
3738 * Fetch the caller's info block.
3739 */
3740
3741 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3742 return -EFAULT;
3743
3744 pos = ifc.ifc_buf;
3745 len = ifc.ifc_len;
3746
3747 /*
3748 * Loop over the interfaces, and write an info block for each.
3749 */
3750
3751 total = 0;
881d966b 3752 for_each_netdev(net, dev) {
1da177e4
LT
3753 for (i = 0; i < NPROTO; i++) {
3754 if (gifconf_list[i]) {
3755 int done;
3756 if (!pos)
3757 done = gifconf_list[i](dev, NULL, 0);
3758 else
3759 done = gifconf_list[i](dev, pos + total,
3760 len - total);
3761 if (done < 0)
3762 return -EFAULT;
3763 total += done;
3764 }
3765 }
4ec93edb 3766 }
1da177e4
LT
3767
3768 /*
3769 * All done. Write the updated control block back to the caller.
3770 */
3771 ifc.ifc_len = total;
3772
3773 /*
3774 * Both BSD and Solaris return 0 here, so we do too.
3775 */
3776 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3777}
3778
3779#ifdef CONFIG_PROC_FS
3780/*
3781 * This is invoked by the /proc filesystem handler to display a device
3782 * in detail.
3783 */
7562f876 3784void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 3785 __acquires(RCU)
1da177e4 3786{
e372c414 3787 struct net *net = seq_file_net(seq);
7562f876 3788 loff_t off;
1da177e4 3789 struct net_device *dev;
1da177e4 3790
c6d14c84 3791 rcu_read_lock();
7562f876
PE
3792 if (!*pos)
3793 return SEQ_START_TOKEN;
1da177e4 3794
7562f876 3795 off = 1;
c6d14c84 3796 for_each_netdev_rcu(net, dev)
7562f876
PE
3797 if (off++ == *pos)
3798 return dev;
1da177e4 3799
7562f876 3800 return NULL;
1da177e4
LT
3801}
3802
3803void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3804{
c6d14c84
ED
3805 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3806 first_net_device(seq_file_net(seq)) :
3807 next_net_device((struct net_device *)v);
3808
1da177e4 3809 ++*pos;
c6d14c84 3810 return rcu_dereference(dev);
1da177e4
LT
3811}
3812
3813void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 3814 __releases(RCU)
1da177e4 3815{
c6d14c84 3816 rcu_read_unlock();
1da177e4
LT
3817}
3818
3819static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3820{
28172739
ED
3821 struct rtnl_link_stats64 temp;
3822 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
1da177e4 3823
be1f3c2c
BH
3824 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3825 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
5a1b5898
RR
3826 dev->name, stats->rx_bytes, stats->rx_packets,
3827 stats->rx_errors,
3828 stats->rx_dropped + stats->rx_missed_errors,
3829 stats->rx_fifo_errors,
3830 stats->rx_length_errors + stats->rx_over_errors +
3831 stats->rx_crc_errors + stats->rx_frame_errors,
3832 stats->rx_compressed, stats->multicast,
3833 stats->tx_bytes, stats->tx_packets,
3834 stats->tx_errors, stats->tx_dropped,
3835 stats->tx_fifo_errors, stats->collisions,
3836 stats->tx_carrier_errors +
3837 stats->tx_aborted_errors +
3838 stats->tx_window_errors +
3839 stats->tx_heartbeat_errors,
3840 stats->tx_compressed);
1da177e4
LT
3841}
3842
3843/*
3844 * Called from the PROCfs module. This now uses the new arbitrary sized
3845 * /proc/net interface to create /proc/net/dev
3846 */
3847static int dev_seq_show(struct seq_file *seq, void *v)
3848{
3849 if (v == SEQ_START_TOKEN)
3850 seq_puts(seq, "Inter-| Receive "
3851 " | Transmit\n"
3852 " face |bytes packets errs drop fifo frame "
3853 "compressed multicast|bytes packets errs "
3854 "drop fifo colls carrier compressed\n");
3855 else
3856 dev_seq_printf_stats(seq, v);
3857 return 0;
3858}
3859
dee42870 3860static struct softnet_data *softnet_get_online(loff_t *pos)
1da177e4 3861{
dee42870 3862 struct softnet_data *sd = NULL;
1da177e4 3863
0c0b0aca 3864 while (*pos < nr_cpu_ids)
4ec93edb 3865 if (cpu_online(*pos)) {
dee42870 3866 sd = &per_cpu(softnet_data, *pos);
1da177e4
LT
3867 break;
3868 } else
3869 ++*pos;
dee42870 3870 return sd;
1da177e4
LT
3871}
3872
3873static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3874{
3875 return softnet_get_online(pos);
3876}
3877
3878static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3879{
3880 ++*pos;
3881 return softnet_get_online(pos);
3882}
3883
3884static void softnet_seq_stop(struct seq_file *seq, void *v)
3885{
3886}
3887
3888static int softnet_seq_show(struct seq_file *seq, void *v)
3889{
dee42870 3890 struct softnet_data *sd = v;
1da177e4 3891
0a9627f2 3892 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
dee42870 3893 sd->processed, sd->dropped, sd->time_squeeze, 0,
c1ebcdb8 3894 0, 0, 0, 0, /* was fastroute */
dee42870 3895 sd->cpu_collision, sd->received_rps);
1da177e4
LT
3896 return 0;
3897}
3898
f690808e 3899static const struct seq_operations dev_seq_ops = {
1da177e4
LT
3900 .start = dev_seq_start,
3901 .next = dev_seq_next,
3902 .stop = dev_seq_stop,
3903 .show = dev_seq_show,
3904};
3905
3906static int dev_seq_open(struct inode *inode, struct file *file)
3907{
e372c414
DL
3908 return seq_open_net(inode, file, &dev_seq_ops,
3909 sizeof(struct seq_net_private));
1da177e4
LT
3910}
3911
9a32144e 3912static const struct file_operations dev_seq_fops = {
1da177e4
LT
3913 .owner = THIS_MODULE,
3914 .open = dev_seq_open,
3915 .read = seq_read,
3916 .llseek = seq_lseek,
e372c414 3917 .release = seq_release_net,
1da177e4
LT
3918};
3919
f690808e 3920static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
3921 .start = softnet_seq_start,
3922 .next = softnet_seq_next,
3923 .stop = softnet_seq_stop,
3924 .show = softnet_seq_show,
3925};
3926
3927static int softnet_seq_open(struct inode *inode, struct file *file)
3928{
3929 return seq_open(file, &softnet_seq_ops);
3930}
3931
9a32144e 3932static const struct file_operations softnet_seq_fops = {
1da177e4
LT
3933 .owner = THIS_MODULE,
3934 .open = softnet_seq_open,
3935 .read = seq_read,
3936 .llseek = seq_lseek,
3937 .release = seq_release,
3938};
3939
0e1256ff
SH
3940static void *ptype_get_idx(loff_t pos)
3941{
3942 struct packet_type *pt = NULL;
3943 loff_t i = 0;
3944 int t;
3945
3946 list_for_each_entry_rcu(pt, &ptype_all, list) {
3947 if (i == pos)
3948 return pt;
3949 ++i;
3950 }
3951
82d8a867 3952 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
3953 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3954 if (i == pos)
3955 return pt;
3956 ++i;
3957 }
3958 }
3959 return NULL;
3960}
3961
3962static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 3963 __acquires(RCU)
0e1256ff
SH
3964{
3965 rcu_read_lock();
3966 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3967}
3968
3969static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3970{
3971 struct packet_type *pt;
3972 struct list_head *nxt;
3973 int hash;
3974
3975 ++*pos;
3976 if (v == SEQ_START_TOKEN)
3977 return ptype_get_idx(0);
3978
3979 pt = v;
3980 nxt = pt->list.next;
3981 if (pt->type == htons(ETH_P_ALL)) {
3982 if (nxt != &ptype_all)
3983 goto found;
3984 hash = 0;
3985 nxt = ptype_base[0].next;
3986 } else
82d8a867 3987 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
3988
3989 while (nxt == &ptype_base[hash]) {
82d8a867 3990 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
3991 return NULL;
3992 nxt = ptype_base[hash].next;
3993 }
3994found:
3995 return list_entry(nxt, struct packet_type, list);
3996}
3997
3998static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 3999 __releases(RCU)
0e1256ff
SH
4000{
4001 rcu_read_unlock();
4002}
4003
0e1256ff
SH
4004static int ptype_seq_show(struct seq_file *seq, void *v)
4005{
4006 struct packet_type *pt = v;
4007
4008 if (v == SEQ_START_TOKEN)
4009 seq_puts(seq, "Type Device Function\n");
c346dca1 4010 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
4011 if (pt->type == htons(ETH_P_ALL))
4012 seq_puts(seq, "ALL ");
4013 else
4014 seq_printf(seq, "%04x", ntohs(pt->type));
4015
908cd2da
AD
4016 seq_printf(seq, " %-8s %pF\n",
4017 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
4018 }
4019
4020 return 0;
4021}
4022
4023static const struct seq_operations ptype_seq_ops = {
4024 .start = ptype_seq_start,
4025 .next = ptype_seq_next,
4026 .stop = ptype_seq_stop,
4027 .show = ptype_seq_show,
4028};
4029
4030static int ptype_seq_open(struct inode *inode, struct file *file)
4031{
2feb27db
PE
4032 return seq_open_net(inode, file, &ptype_seq_ops,
4033 sizeof(struct seq_net_private));
0e1256ff
SH
4034}
4035
4036static const struct file_operations ptype_seq_fops = {
4037 .owner = THIS_MODULE,
4038 .open = ptype_seq_open,
4039 .read = seq_read,
4040 .llseek = seq_lseek,
2feb27db 4041 .release = seq_release_net,
0e1256ff
SH
4042};
4043
4044
4665079c 4045static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
4046{
4047 int rc = -ENOMEM;
4048
881d966b 4049 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 4050 goto out;
881d966b 4051 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 4052 goto out_dev;
881d966b 4053 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 4054 goto out_softnet;
0e1256ff 4055
881d966b 4056 if (wext_proc_init(net))
457c4cbc 4057 goto out_ptype;
1da177e4
LT
4058 rc = 0;
4059out:
4060 return rc;
457c4cbc 4061out_ptype:
881d966b 4062 proc_net_remove(net, "ptype");
1da177e4 4063out_softnet:
881d966b 4064 proc_net_remove(net, "softnet_stat");
1da177e4 4065out_dev:
881d966b 4066 proc_net_remove(net, "dev");
1da177e4
LT
4067 goto out;
4068}
881d966b 4069
4665079c 4070static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
4071{
4072 wext_proc_exit(net);
4073
4074 proc_net_remove(net, "ptype");
4075 proc_net_remove(net, "softnet_stat");
4076 proc_net_remove(net, "dev");
4077}
4078
022cbae6 4079static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
4080 .init = dev_proc_net_init,
4081 .exit = dev_proc_net_exit,
4082};
4083
4084static int __init dev_proc_init(void)
4085{
4086 return register_pernet_subsys(&dev_proc_ops);
4087}
1da177e4
LT
4088#else
4089#define dev_proc_init() 0
4090#endif /* CONFIG_PROC_FS */
4091
4092
4093/**
4094 * netdev_set_master - set up master/slave pair
4095 * @slave: slave device
4096 * @master: new master device
4097 *
4098 * Changes the master device of the slave. Pass %NULL to break the
4099 * bonding. The caller must hold the RTNL semaphore. On a failure
4100 * a negative errno code is returned. On success the reference counts
4101 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4102 * function returns zero.
4103 */
4104int netdev_set_master(struct net_device *slave, struct net_device *master)
4105{
4106 struct net_device *old = slave->master;
4107
4108 ASSERT_RTNL();
4109
4110 if (master) {
4111 if (old)
4112 return -EBUSY;
4113 dev_hold(master);
4114 }
4115
4116 slave->master = master;
4ec93edb 4117
283f2fe8
ED
4118 if (old) {
4119 synchronize_net();
1da177e4 4120 dev_put(old);
283f2fe8 4121 }
1da177e4
LT
4122 if (master)
4123 slave->flags |= IFF_SLAVE;
4124 else
4125 slave->flags &= ~IFF_SLAVE;
4126
4127 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4128 return 0;
4129}
d1b19dff 4130EXPORT_SYMBOL(netdev_set_master);
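
/*
 * Illustrative sketch (not part of dev.c): the bonding driver pairs
 * devices with netdev_set_master() while enslaving and breaks the pair
 * again by passing a NULL master. Per the comment above, RTNL must be
 * held; my_enslave() is hypothetical and elides driver-specific setup.
 */
static int my_enslave(struct net_device *bond_dev,
		      struct net_device *slave_dev)
{
	int err;

	ASSERT_RTNL();
	err = netdev_set_master(slave_dev, bond_dev);
	if (err)
		return err;
	/* ... driver-specific slave initialization ... */
	return 0;
}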
1da177e4 4131
b6c40d68
PM
4132static void dev_change_rx_flags(struct net_device *dev, int flags)
4133{
d314774c
SH
4134 const struct net_device_ops *ops = dev->netdev_ops;
4135
4136 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4137 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4138}
4139
dad9b335 4140static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4
LT
4141{
4142 unsigned short old_flags = dev->flags;
8192b0c4
DH
4143 uid_t uid;
4144 gid_t gid;
1da177e4 4145
24023451
PM
4146 ASSERT_RTNL();
4147
dad9b335
WC
4148 dev->flags |= IFF_PROMISC;
4149 dev->promiscuity += inc;
4150 if (dev->promiscuity == 0) {
4151 /*
4152 * Avoid overflow.
 4153 * If inc causes overflow, leave promisc untouched and return an error.
4154 */
4155 if (inc < 0)
4156 dev->flags &= ~IFF_PROMISC;
4157 else {
4158 dev->promiscuity -= inc;
4159 printk(KERN_WARNING "%s: promiscuity touches roof, "
4160 "set promiscuity failed, promiscuity feature "
4161 "of device might be broken.\n", dev->name);
4162 return -EOVERFLOW;
4163 }
4164 }
52609c0b 4165 if (dev->flags != old_flags) {
1da177e4
LT
4166 printk(KERN_INFO "device %s %s promiscuous mode\n",
4167 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4ec93edb 4168 "left");
8192b0c4
DH
4169 if (audit_enabled) {
4170 current_uid_gid(&uid, &gid);
7759db82
KHK
4171 audit_log(current->audit_context, GFP_ATOMIC,
4172 AUDIT_ANOM_PROMISCUOUS,
4173 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4174 dev->name, (dev->flags & IFF_PROMISC),
4175 (old_flags & IFF_PROMISC),
4176 audit_get_loginuid(current),
8192b0c4 4177 uid, gid,
7759db82 4178 audit_get_sessionid(current));
8192b0c4 4179 }
24023451 4180
b6c40d68 4181 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4182 }
dad9b335 4183 return 0;
1da177e4
LT
4184}
4185
4417da66
PM
4186/**
4187 * dev_set_promiscuity - update promiscuity count on a device
4188 * @dev: device
4189 * @inc: modifier
4190 *
4191 * Add or remove promiscuity from a device. While the count in the device
4192 * remains above zero the interface remains promiscuous. Once it hits zero
4193 * the device reverts back to normal filtering operation. A negative inc
4194 * value is used to drop promiscuity on the device.
dad9b335 4195 * Return 0 if successful or a negative errno code on error.
4417da66 4196 */
dad9b335 4197int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66
PM
4198{
4199 unsigned short old_flags = dev->flags;
dad9b335 4200 int err;
4417da66 4201
dad9b335 4202 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4203 if (err < 0)
dad9b335 4204 return err;
4417da66
PM
4205 if (dev->flags != old_flags)
4206 dev_set_rx_mode(dev);
dad9b335 4207 return err;
4417da66 4208}
d1b19dff 4209EXPORT_SYMBOL(dev_set_promiscuity);
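
/*
 * Illustrative sketch (not part of dev.c): packet-capture style users
 * bump the promiscuity count while a tap is attached and drop it on
 * detach, letting the refcount decide the real interface state. Both
 * calls must run under RTNL; my_tap_*() are hypothetical.
 */
static int my_tap_attach(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);	/* count 0 -> 1 enables it */
}

static void my_tap_detach(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);		/* count 1 -> 0 disables it */
}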
4417da66 4210
1da177e4
LT
4211/**
4212 * dev_set_allmulti - update allmulti count on a device
4213 * @dev: device
4214 * @inc: modifier
4215 *
4216 * Add or remove reception of all multicast frames to a device. While the
4217 * count in the device remains above zero the interface remains listening
 4218 * to all multicast frames. Once it hits zero the device reverts back to normal
4219 * filtering operation. A negative @inc value is used to drop the counter
4220 * when releasing a resource needing all multicasts.
dad9b335 4221 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4222 */
4223
dad9b335 4224int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4
LT
4225{
4226 unsigned short old_flags = dev->flags;
4227
24023451
PM
4228 ASSERT_RTNL();
4229
1da177e4 4230 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4231 dev->allmulti += inc;
4232 if (dev->allmulti == 0) {
4233 /*
4234 * Avoid overflow.
 4235 * If inc causes overflow, leave allmulti untouched and return an error.
4236 */
4237 if (inc < 0)
4238 dev->flags &= ~IFF_ALLMULTI;
4239 else {
4240 dev->allmulti -= inc;
4241 printk(KERN_WARNING "%s: allmulti touches roof, "
4242 "set allmulti failed, allmulti feature of "
4243 "device might be broken.\n", dev->name);
4244 return -EOVERFLOW;
4245 }
4246 }
24023451 4247 if (dev->flags ^ old_flags) {
b6c40d68 4248 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4249 dev_set_rx_mode(dev);
24023451 4250 }
dad9b335 4251 return 0;
4417da66 4252}
d1b19dff 4253EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
4254
4255/*
4256 * Upload unicast and multicast address lists to device and
4257 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4258 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4259 * are present.
4260 */
4261void __dev_set_rx_mode(struct net_device *dev)
4262{
d314774c
SH
4263 const struct net_device_ops *ops = dev->netdev_ops;
4264
4417da66
PM
4265 /* dev_open will call this function so the list will stay sane. */
4266 if (!(dev->flags&IFF_UP))
4267 return;
4268
4269 if (!netif_device_present(dev))
40b77c94 4270 return;
4417da66 4271
d314774c
SH
4272 if (ops->ndo_set_rx_mode)
4273 ops->ndo_set_rx_mode(dev);
4417da66
PM
4274 else {
 4275 /* Unicast address changes may only happen under the rtnl,
4276 * therefore calling __dev_set_promiscuity here is safe.
4277 */
32e7bfc4 4278 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66
PM
4279 __dev_set_promiscuity(dev, 1);
4280 dev->uc_promisc = 1;
32e7bfc4 4281 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66
PM
4282 __dev_set_promiscuity(dev, -1);
4283 dev->uc_promisc = 0;
4284 }
4285
d314774c
SH
4286 if (ops->ndo_set_multicast_list)
4287 ops->ndo_set_multicast_list(dev);
4417da66
PM
4288 }
4289}
4290
4291void dev_set_rx_mode(struct net_device *dev)
4292{
b9e40857 4293 netif_addr_lock_bh(dev);
4417da66 4294 __dev_set_rx_mode(dev);
b9e40857 4295 netif_addr_unlock_bh(dev);
1da177e4
LT
4296}
4297
f0db275a
SH
4298/**
4299 * dev_get_flags - get flags reported to userspace
4300 * @dev: device
4301 *
4302 * Get the combination of flag bits exported through APIs to userspace.
4303 */
1da177e4
LT
4304unsigned dev_get_flags(const struct net_device *dev)
4305{
4306 unsigned flags;
4307
4308 flags = (dev->flags & ~(IFF_PROMISC |
4309 IFF_ALLMULTI |
b00055aa
SR
4310 IFF_RUNNING |
4311 IFF_LOWER_UP |
4312 IFF_DORMANT)) |
1da177e4
LT
4313 (dev->gflags & (IFF_PROMISC |
4314 IFF_ALLMULTI));
4315
b00055aa
SR
4316 if (netif_running(dev)) {
4317 if (netif_oper_up(dev))
4318 flags |= IFF_RUNNING;
4319 if (netif_carrier_ok(dev))
4320 flags |= IFF_LOWER_UP;
4321 if (netif_dormant(dev))
4322 flags |= IFF_DORMANT;
4323 }
1da177e4
LT
4324
4325 return flags;
4326}
d1b19dff 4327EXPORT_SYMBOL(dev_get_flags);
1da177e4 4328
bd380811 4329int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4330{
1da177e4 4331 int old_flags = dev->flags;
bd380811 4332 int ret;
1da177e4 4333
24023451
PM
4334 ASSERT_RTNL();
4335
1da177e4
LT
4336 /*
4337 * Set the flags on our device.
4338 */
4339
4340 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4341 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4342 IFF_AUTOMEDIA)) |
4343 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4344 IFF_ALLMULTI));
4345
4346 /*
4347 * Load in the correct multicast list now the flags have changed.
4348 */
4349
b6c40d68
PM
4350 if ((old_flags ^ flags) & IFF_MULTICAST)
4351 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4352
4417da66 4353 dev_set_rx_mode(dev);
1da177e4
LT
4354
4355 /*
 4356 * Have we downed the interface? We handle IFF_UP ourselves
4357 * according to user attempts to set it, rather than blindly
4358 * setting it.
4359 */
4360
4361 ret = 0;
4362 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4363 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4364
4365 if (!ret)
4417da66 4366 dev_set_rx_mode(dev);
1da177e4
LT
4367 }
4368
1da177e4 4369 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4370 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4371
1da177e4
LT
4372 dev->gflags ^= IFF_PROMISC;
4373 dev_set_promiscuity(dev, inc);
4374 }
4375
4376 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 4377 is important. Some (broken) drivers set IFF_PROMISC when
 4378 IFF_ALLMULTI is requested, without asking us and without reporting.
4379 */
4380 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4381 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4382
1da177e4
LT
4383 dev->gflags ^= IFF_ALLMULTI;
4384 dev_set_allmulti(dev, inc);
4385 }
4386
bd380811
PM
4387 return ret;
4388}
4389
4390void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4391{
4392 unsigned int changes = dev->flags ^ old_flags;
4393
4394 if (changes & IFF_UP) {
4395 if (dev->flags & IFF_UP)
4396 call_netdevice_notifiers(NETDEV_UP, dev);
4397 else
4398 call_netdevice_notifiers(NETDEV_DOWN, dev);
4399 }
4400
4401 if (dev->flags & IFF_UP &&
4402 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4403 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4404}
4405
4406/**
4407 * dev_change_flags - change device settings
4408 * @dev: device
4409 * @flags: device state flags
4410 *
4411 * Change settings on device based state flags. The flags are
4412 * in the userspace exported format.
4413 */
4414int dev_change_flags(struct net_device *dev, unsigned flags)
4415{
4416 int ret, changes;
4417 int old_flags = dev->flags;
4418
4419 ret = __dev_change_flags(dev, flags);
4420 if (ret < 0)
4421 return ret;
4422
4423 changes = old_flags ^ dev->flags;
7c355f53
TG
4424 if (changes)
4425 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4426
bd380811 4427 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4428 return ret;
4429}
d1b19dff 4430EXPORT_SYMBOL(dev_change_flags);
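
/*
 * Illustrative sketch (not part of dev.c): bringing an interface up the
 * way the SIOCSIFFLAGS path does, by OR-ing IFF_UP into the flags seen
 * by userspace. dev_change_flags() must be called under RTNL;
 * my_bring_up() is hypothetical.
 */
static int my_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}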
1da177e4 4431
f0db275a
SH
4432/**
4433 * dev_set_mtu - Change maximum transfer unit
4434 * @dev: device
4435 * @new_mtu: new transfer unit
4436 *
4437 * Change the maximum transfer size of the network device.
4438 */
1da177e4
LT
4439int dev_set_mtu(struct net_device *dev, int new_mtu)
4440{
d314774c 4441 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4442 int err;
4443
4444 if (new_mtu == dev->mtu)
4445 return 0;
4446
4447 /* MTU must be positive. */
4448 if (new_mtu < 0)
4449 return -EINVAL;
4450
4451 if (!netif_device_present(dev))
4452 return -ENODEV;
4453
4454 err = 0;
d314774c
SH
4455 if (ops->ndo_change_mtu)
4456 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4457 else
4458 dev->mtu = new_mtu;
d314774c 4459
1da177e4 4460 if (!err && dev->flags & IFF_UP)
056925ab 4461 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4462 return err;
4463}
d1b19dff 4464EXPORT_SYMBOL(dev_set_mtu);
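
/*
 * Illustrative sketch (not part of dev.c): a stacked device such as a
 * vlan or bond typically propagates its lower device's MTU via
 * dev_set_mtu() under RTNL. Drivers without ndo_change_mtu get the
 * plain assignment fallback above. my_sync_mtu() is hypothetical.
 */
static int my_sync_mtu(struct net_device *upper, struct net_device *lower)
{
	ASSERT_RTNL();
	return dev_set_mtu(upper, lower->mtu);
}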
1da177e4 4465
f0db275a
SH
4466/**
4467 * dev_set_mac_address - Change Media Access Control Address
4468 * @dev: device
4469 * @sa: new address
4470 *
4471 * Change the hardware (MAC) address of the device
4472 */
1da177e4
LT
4473int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4474{
d314774c 4475 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4476 int err;
4477
d314774c 4478 if (!ops->ndo_set_mac_address)
1da177e4
LT
4479 return -EOPNOTSUPP;
4480 if (sa->sa_family != dev->type)
4481 return -EINVAL;
4482 if (!netif_device_present(dev))
4483 return -ENODEV;
d314774c 4484 err = ops->ndo_set_mac_address(dev, sa);
1da177e4 4485 if (!err)
056925ab 4486 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
4487 return err;
4488}
d1b19dff 4489EXPORT_SYMBOL(dev_set_mac_address);
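
/*
 * Illustrative sketch (not part of dev.c): callers hand in a sockaddr
 * whose sa_family matches dev->type (e.g. ARPHRD_ETHER) and whose
 * sa_data carries the new hardware address. RTNL must be held, as on
 * the SIOCSIFHWADDR path below. my_set_mac() is hypothetical.
 */
static int my_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);
	return dev_set_mac_address(dev, &sa);
}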
1da177e4
LT
4490
4491/*
3710becf 4492 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
1da177e4 4493 */
14e3e079 4494static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
4495{
4496 int err;
3710becf 4497 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
1da177e4
LT
4498
4499 if (!dev)
4500 return -ENODEV;
4501
4502 switch (cmd) {
d1b19dff
ED
4503 case SIOCGIFFLAGS: /* Get interface flags */
4504 ifr->ifr_flags = (short) dev_get_flags(dev);
4505 return 0;
1da177e4 4506
d1b19dff
ED
4507 case SIOCGIFMETRIC: /* Get the metric on the interface
4508 (currently unused) */
4509 ifr->ifr_metric = 0;
4510 return 0;
1da177e4 4511
d1b19dff
ED
4512 case SIOCGIFMTU: /* Get the MTU of a device */
4513 ifr->ifr_mtu = dev->mtu;
4514 return 0;
1da177e4 4515
d1b19dff
ED
4516 case SIOCGIFHWADDR:
4517 if (!dev->addr_len)
4518 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4519 else
4520 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4521 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4522 ifr->ifr_hwaddr.sa_family = dev->type;
4523 return 0;
1da177e4 4524
d1b19dff
ED
4525 case SIOCGIFSLAVE:
4526 err = -EINVAL;
4527 break;
14e3e079 4528
d1b19dff
ED
4529 case SIOCGIFMAP:
4530 ifr->ifr_map.mem_start = dev->mem_start;
4531 ifr->ifr_map.mem_end = dev->mem_end;
4532 ifr->ifr_map.base_addr = dev->base_addr;
4533 ifr->ifr_map.irq = dev->irq;
4534 ifr->ifr_map.dma = dev->dma;
4535 ifr->ifr_map.port = dev->if_port;
4536 return 0;
14e3e079 4537
d1b19dff
ED
4538 case SIOCGIFINDEX:
4539 ifr->ifr_ifindex = dev->ifindex;
4540 return 0;
14e3e079 4541
d1b19dff
ED
4542 case SIOCGIFTXQLEN:
4543 ifr->ifr_qlen = dev->tx_queue_len;
4544 return 0;
14e3e079 4545
d1b19dff
ED
4546 default:
4547 /* dev_ioctl() should ensure this case
4548 * is never reached
4549 */
4550 WARN_ON(1);
4551 err = -EINVAL;
4552 break;
14e3e079
JG
4553
4554 }
4555 return err;
4556}
4557
4558/*
4559 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4560 */
4561static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4562{
4563 int err;
4564 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5f2f6da7 4565 const struct net_device_ops *ops;
14e3e079
JG
4566
4567 if (!dev)
4568 return -ENODEV;
4569
5f2f6da7
JP
4570 ops = dev->netdev_ops;
4571
14e3e079 4572 switch (cmd) {
d1b19dff
ED
4573 case SIOCSIFFLAGS: /* Set interface flags */
4574 return dev_change_flags(dev, ifr->ifr_flags);
14e3e079 4575
d1b19dff
ED
4576 case SIOCSIFMETRIC: /* Set the metric on the interface
4577 (currently unused) */
4578 return -EOPNOTSUPP;
14e3e079 4579
d1b19dff
ED
4580 case SIOCSIFMTU: /* Set the MTU of a device */
4581 return dev_set_mtu(dev, ifr->ifr_mtu);
1da177e4 4582
d1b19dff
ED
4583 case SIOCSIFHWADDR:
4584 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
1da177e4 4585
d1b19dff
ED
4586 case SIOCSIFHWBROADCAST:
4587 if (ifr->ifr_hwaddr.sa_family != dev->type)
4588 return -EINVAL;
4589 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4590 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4591 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4592 return 0;
1da177e4 4593
d1b19dff
ED
4594 case SIOCSIFMAP:
4595 if (ops->ndo_set_config) {
1da177e4
LT
4596 if (!netif_device_present(dev))
4597 return -ENODEV;
d1b19dff
ED
4598 return ops->ndo_set_config(dev, &ifr->ifr_map);
4599 }
4600 return -EOPNOTSUPP;
1da177e4 4601
d1b19dff
ED
4602 case SIOCADDMULTI:
4603 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4604 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4605 return -EINVAL;
4606 if (!netif_device_present(dev))
4607 return -ENODEV;
22bedad3 4608 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
d1b19dff
ED
4609
4610 case SIOCDELMULTI:
4611 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4612 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4613 return -EINVAL;
4614 if (!netif_device_present(dev))
4615 return -ENODEV;
22bedad3 4616 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
1da177e4 4617
d1b19dff
ED
4618 case SIOCSIFTXQLEN:
4619 if (ifr->ifr_qlen < 0)
4620 return -EINVAL;
4621 dev->tx_queue_len = ifr->ifr_qlen;
4622 return 0;
1da177e4 4623
d1b19dff
ED
4624 case SIOCSIFNAME:
4625 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4626 return dev_change_name(dev, ifr->ifr_newname);
1da177e4 4627
d1b19dff
ED
4628 /*
4629 * Unknown or private ioctl
4630 */
4631 default:
4632 if ((cmd >= SIOCDEVPRIVATE &&
4633 cmd <= SIOCDEVPRIVATE + 15) ||
4634 cmd == SIOCBONDENSLAVE ||
4635 cmd == SIOCBONDRELEASE ||
4636 cmd == SIOCBONDSETHWADDR ||
4637 cmd == SIOCBONDSLAVEINFOQUERY ||
4638 cmd == SIOCBONDINFOQUERY ||
4639 cmd == SIOCBONDCHANGEACTIVE ||
4640 cmd == SIOCGMIIPHY ||
4641 cmd == SIOCGMIIREG ||
4642 cmd == SIOCSMIIREG ||
4643 cmd == SIOCBRADDIF ||
4644 cmd == SIOCBRDELIF ||
4645 cmd == SIOCSHWTSTAMP ||
4646 cmd == SIOCWANDEV) {
4647 err = -EOPNOTSUPP;
4648 if (ops->ndo_do_ioctl) {
4649 if (netif_device_present(dev))
4650 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4651 else
4652 err = -ENODEV;
4653 }
4654 } else
4655 err = -EINVAL;
1da177e4
LT
4656
4657 }
4658 return err;
4659}
4660
4661/*
4662 * This function handles all "interface"-type I/O control requests. The actual
4663 * 'doing' part of this is dev_ifsioc above.
4664 */
4665
4666/**
4667 * dev_ioctl - network device ioctl
c4ea43c5 4668 * @net: the applicable net namespace
1da177e4
LT
4669 * @cmd: command to issue
4670 * @arg: pointer to a struct ifreq in user space
4671 *
4672 * Issue ioctl functions to devices. This is normally called by the
4673 * user space syscall interfaces but can sometimes be useful for
4674 * other purposes. The return value is the return from the syscall if
4675 * positive or a negative errno code on error.
4676 */
4677
881d966b 4678int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
4679{
4680 struct ifreq ifr;
4681 int ret;
4682 char *colon;
4683
4684 /* One special case: SIOCGIFCONF takes ifconf argument
4685 and requires shared lock, because it sleeps writing
4686 to user space.
4687 */
4688
4689 if (cmd == SIOCGIFCONF) {
6756ae4b 4690 rtnl_lock();
881d966b 4691 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 4692 rtnl_unlock();
1da177e4
LT
4693 return ret;
4694 }
4695 if (cmd == SIOCGIFNAME)
881d966b 4696 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
4697
4698 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4699 return -EFAULT;
4700
4701 ifr.ifr_name[IFNAMSIZ-1] = 0;
4702
4703 colon = strchr(ifr.ifr_name, ':');
4704 if (colon)
4705 *colon = 0;
4706
4707 /*
4708 * See which interface the caller is talking about.
4709 */
4710
4711 switch (cmd) {
d1b19dff
ED
4712 /*
4713 * These ioctl calls:
4714 * - can be done by all.
4715 * - atomic and do not require locking.
4716 * - return a value
4717 */
4718 case SIOCGIFFLAGS:
4719 case SIOCGIFMETRIC:
4720 case SIOCGIFMTU:
4721 case SIOCGIFHWADDR:
4722 case SIOCGIFSLAVE:
4723 case SIOCGIFMAP:
4724 case SIOCGIFINDEX:
4725 case SIOCGIFTXQLEN:
4726 dev_load(net, ifr.ifr_name);
3710becf 4727 rcu_read_lock();
d1b19dff 4728 ret = dev_ifsioc_locked(net, &ifr, cmd);
3710becf 4729 rcu_read_unlock();
d1b19dff
ED
4730 if (!ret) {
4731 if (colon)
4732 *colon = ':';
4733 if (copy_to_user(arg, &ifr,
4734 sizeof(struct ifreq)))
4735 ret = -EFAULT;
4736 }
4737 return ret;
1da177e4 4738
d1b19dff
ED
4739 case SIOCETHTOOL:
4740 dev_load(net, ifr.ifr_name);
4741 rtnl_lock();
4742 ret = dev_ethtool(net, &ifr);
4743 rtnl_unlock();
4744 if (!ret) {
4745 if (colon)
4746 *colon = ':';
4747 if (copy_to_user(arg, &ifr,
4748 sizeof(struct ifreq)))
4749 ret = -EFAULT;
4750 }
4751 return ret;
1da177e4 4752
d1b19dff
ED
4753 /*
4754 * These ioctl calls:
4755 * - require superuser power.
4756 * - require strict serialization.
4757 * - return a value
4758 */
4759 case SIOCGMIIPHY:
4760 case SIOCGMIIREG:
4761 case SIOCSIFNAME:
4762 if (!capable(CAP_NET_ADMIN))
4763 return -EPERM;
4764 dev_load(net, ifr.ifr_name);
4765 rtnl_lock();
4766 ret = dev_ifsioc(net, &ifr, cmd);
4767 rtnl_unlock();
4768 if (!ret) {
4769 if (colon)
4770 *colon = ':';
4771 if (copy_to_user(arg, &ifr,
4772 sizeof(struct ifreq)))
4773 ret = -EFAULT;
4774 }
4775 return ret;
1da177e4 4776
d1b19dff
ED
4777 /*
4778 * These ioctl calls:
4779 * - require superuser power.
4780 * - require strict serialization.
4781 * - do not return a value
4782 */
4783 case SIOCSIFFLAGS:
4784 case SIOCSIFMETRIC:
4785 case SIOCSIFMTU:
4786 case SIOCSIFMAP:
4787 case SIOCSIFHWADDR:
4788 case SIOCSIFSLAVE:
4789 case SIOCADDMULTI:
4790 case SIOCDELMULTI:
4791 case SIOCSIFHWBROADCAST:
4792 case SIOCSIFTXQLEN:
4793 case SIOCSMIIREG:
4794 case SIOCBONDENSLAVE:
4795 case SIOCBONDRELEASE:
4796 case SIOCBONDSETHWADDR:
4797 case SIOCBONDCHANGEACTIVE:
4798 case SIOCBRADDIF:
4799 case SIOCBRDELIF:
4800 case SIOCSHWTSTAMP:
4801 if (!capable(CAP_NET_ADMIN))
4802 return -EPERM;
4803 /* fall through */
4804 case SIOCBONDSLAVEINFOQUERY:
4805 case SIOCBONDINFOQUERY:
4806 dev_load(net, ifr.ifr_name);
4807 rtnl_lock();
4808 ret = dev_ifsioc(net, &ifr, cmd);
4809 rtnl_unlock();
4810 return ret;
4811
4812 case SIOCGIFMEM:
4813 /* Get the per device memory space. We can add this but
4814 * currently do not support it */
4815 case SIOCSIFMEM:
4816 /* Set the per device memory buffer space.
4817 * Not applicable in our case */
4818 case SIOCSIFLINK:
4819 return -EINVAL;
4820
4821 /*
4822 * Unknown or private ioctl.
4823 */
4824 default:
4825 if (cmd == SIOCWANDEV ||
4826 (cmd >= SIOCDEVPRIVATE &&
4827 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 4828 dev_load(net, ifr.ifr_name);
1da177e4 4829 rtnl_lock();
881d966b 4830 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4 4831 rtnl_unlock();
d1b19dff
ED
4832 if (!ret && copy_to_user(arg, &ifr,
4833 sizeof(struct ifreq)))
4834 ret = -EFAULT;
1da177e4 4835 return ret;
d1b19dff
ED
4836 }
4837 /* Take care of Wireless Extensions */
4838 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4839 return wext_handle_ioctl(net, &ifr, cmd, arg);
4840 return -EINVAL;
1da177e4
LT
4841 }
4842}
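
/*
 * Illustrative sketch (not part of dev.c): the userspace counterpart of
 * the SIOCGIFFLAGS branch above. Any socket fd reaches dev_ioctl(); the
 * kernel copies the struct ifreq in, fills it and copies it back.
 * Illustrative userspace C, kept in a comment since it cannot live in a
 * kernel source file:
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <net/if.h>
 *
 *	int if_is_up(int sock, const char *name)
 *	{
 *		struct ifreq ifr;
 *
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
 *		if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0)
 *			return -1;
 *		return !!(ifr.ifr_flags & IFF_UP);
 *	}
 */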
4843
4844
4845/**
4846 * dev_new_index - allocate an ifindex
c4ea43c5 4847 * @net: the applicable net namespace
1da177e4
LT
4848 *
4849 * Returns a suitable unique value for a new device interface
4850 * number. The caller must hold the rtnl semaphore or the
4851 * dev_base_lock to be sure it remains unique.
4852 */
881d966b 4853static int dev_new_index(struct net *net)
1da177e4
LT
4854{
4855 static int ifindex;
4856 for (;;) {
4857 if (++ifindex <= 0)
4858 ifindex = 1;
881d966b 4859 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
4860 return ifindex;
4861 }
4862}
4863
1da177e4 4864/* Delayed registration/unregistration */
3b5b34fd 4865static LIST_HEAD(net_todo_list);
1da177e4 4866
6f05f629 4867static void net_set_todo(struct net_device *dev)
1da177e4 4868{
1da177e4 4869 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
4870}
4871
9b5e383c 4872static void rollback_registered_many(struct list_head *head)
93ee31f1 4873{
e93737b0 4874 struct net_device *dev, *tmp;
9b5e383c 4875
93ee31f1
DL
4876 BUG_ON(dev_boot_phase);
4877 ASSERT_RTNL();
4878
e93737b0 4879 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 4880 /* Some devices call without registering
e93737b0
KK
4881 * for initialization unwind. Remove those
4882 * devices and proceed with the remaining.
9b5e383c
ED
4883 */
4884 if (dev->reg_state == NETREG_UNINITIALIZED) {
4885 pr_debug("unregister_netdevice: device %s/%p never "
4886 "was registered\n", dev->name, dev);
93ee31f1 4887
9b5e383c 4888 WARN_ON(1);
e93737b0
KK
4889 list_del(&dev->unreg_list);
4890 continue;
9b5e383c 4891 }
93ee31f1 4892
9b5e383c 4893 BUG_ON(dev->reg_state != NETREG_REGISTERED);
93ee31f1 4894
9b5e383c
ED
4895 /* If device is running, close it first. */
4896 dev_close(dev);
93ee31f1 4897
9b5e383c
ED
4898 /* And unlink it from device chain. */
4899 unlist_netdevice(dev);
93ee31f1 4900
9b5e383c
ED
4901 dev->reg_state = NETREG_UNREGISTERING;
4902 }
93ee31f1
DL
4903
4904 synchronize_net();
4905
9b5e383c
ED
4906 list_for_each_entry(dev, head, unreg_list) {
4907 /* Shutdown queueing discipline. */
4908 dev_shutdown(dev);
93ee31f1
DL
4909
4910
9b5e383c
ED
 4911 /* Notify protocols that we are about to destroy
 4912 this device. They should clean up all their state.
4913 */
4914 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 4915
a2835763
PM
4916 if (!dev->rtnl_link_ops ||
4917 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4918 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4919
9b5e383c
ED
4920 /*
4921 * Flush the unicast and multicast chains
4922 */
a748ee24 4923 dev_uc_flush(dev);
22bedad3 4924 dev_mc_flush(dev);
93ee31f1 4925
9b5e383c
ED
4926 if (dev->netdev_ops->ndo_uninit)
4927 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 4928
9b5e383c
ED
4929 /* Notifier chain MUST detach us from master device. */
4930 WARN_ON(dev->master);
93ee31f1 4931
9b5e383c
ED
4932 /* Remove entries from kobject tree */
4933 netdev_unregister_kobject(dev);
4934 }
93ee31f1 4935
a5ee1551 4936 /* Process any work delayed until the end of the batch */
e5e26d75 4937 dev = list_first_entry(head, struct net_device, unreg_list);
a5ee1551 4938 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
93ee31f1 4939
ef885afb 4940 rcu_barrier();
395264d5 4941
a5ee1551 4942 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
4943 dev_put(dev);
4944}
4945
4946static void rollback_registered(struct net_device *dev)
4947{
4948 LIST_HEAD(single);
4949
4950 list_add(&dev->unreg_list, &single);
4951 rollback_registered_many(&single);
93ee31f1
DL
4952}
4953
b63365a2
HX
4954unsigned long netdev_fix_features(unsigned long features, const char *name)
4955{
4956 /* Fix illegal SG+CSUM combinations. */
4957 if ((features & NETIF_F_SG) &&
4958 !(features & NETIF_F_ALL_CSUM)) {
4959 if (name)
4960 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4961 "checksum feature.\n", name);
4962 features &= ~NETIF_F_SG;
4963 }
4964
4965 /* TSO requires that SG is present as well. */
4966 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4967 if (name)
4968 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4969 "SG feature.\n", name);
4970 features &= ~NETIF_F_TSO;
4971 }
4972
4973 if (features & NETIF_F_UFO) {
4974 if (!(features & NETIF_F_GEN_CSUM)) {
4975 if (name)
4976 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4977 "since no NETIF_F_HW_CSUM feature.\n",
4978 name);
4979 features &= ~NETIF_F_UFO;
4980 }
4981
4982 if (!(features & NETIF_F_SG)) {
4983 if (name)
4984 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4985 "since no NETIF_F_SG feature.\n", name);
4986 features &= ~NETIF_F_UFO;
4987 }
4988 }
4989
4990 return features;
4991}
4992EXPORT_SYMBOL(netdev_fix_features);
4993
fc4a7489
PM
4994/**
4995 * netif_stacked_transfer_operstate - transfer operstate
4996 * @rootdev: the root or lower level device to transfer state from
4997 * @dev: the device to transfer operstate to
4998 *
4999 * Transfer operational state from root to device. This is normally
5000 * called when a stacking relationship exists between the root
 5001 * device and the device (a leaf device).
5002 */
5003void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5004 struct net_device *dev)
5005{
5006 if (rootdev->operstate == IF_OPER_DORMANT)
5007 netif_dormant_on(dev);
5008 else
5009 netif_dormant_off(dev);
5010
5011 if (netif_carrier_ok(rootdev)) {
5012 if (!netif_carrier_ok(dev))
5013 netif_carrier_on(dev);
5014 } else {
5015 if (netif_carrier_ok(dev))
5016 netif_carrier_off(dev);
5017 }
5018}
5019EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5020
1b4bf461
ED
5021static int netif_alloc_rx_queues(struct net_device *dev)
5022{
5023#ifdef CONFIG_RPS
5024 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5025 struct netdev_rx_queue *rx;
1b4bf461 5026
bd25fa7b 5027 BUG_ON(count < 1);
1b4bf461 5028
bd25fa7b
TH
5029 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5030 if (!rx) {
5031 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5032 return -ENOMEM;
1b4bf461 5033 }
bd25fa7b
TH
5034 dev->_rx = rx;
5035
5036 /*
5037 * Set a pointer to first element in the array which holds the
5038 * reference count.
5039 */
5040 for (i = 0; i < count; i++)
5041 rx[i].first = rx;
1b4bf461
ED
5042#endif
5043 return 0;
5044}
5045
e6484930
TH
5046static int netif_alloc_netdev_queues(struct net_device *dev)
5047{
5048 unsigned int count = dev->num_tx_queues;
5049 struct netdev_queue *tx;
5050
5051 BUG_ON(count < 1);
5052
5053 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5054 if (!tx) {
5055 pr_err("netdev: Unable to allocate %u tx queues.\n",
5056 count);
5057 return -ENOMEM;
5058 }
5059 dev->_tx = tx;
5060 return 0;
5061}
5062
5063static void netdev_init_one_queue(struct net_device *dev,
5064 struct netdev_queue *queue,
5065 void *_unused)
5066{
5067 queue->dev = dev;
5068
5069 /* Initialize queue lock */
5070 spin_lock_init(&queue->_xmit_lock);
5071 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5072 queue->xmit_lock_owner = -1;
5073}
5074
5075static void netdev_init_queues(struct net_device *dev)
5076{
5077 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5078 spin_lock_init(&dev->tx_global_lock);
5079}
5080
1da177e4
LT
5081/**
5082 * register_netdevice - register a network device
5083 * @dev: device to register
5084 *
5085 * Take a completed network device structure and add it to the kernel
5086 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5087 * chain. 0 is returned on success. A negative errno code is returned
5088 * on a failure to set up the device, or if the name is a duplicate.
5089 *
5090 * Callers must hold the rtnl semaphore. You may want
5091 * register_netdev() instead of this.
5092 *
5093 * BUGS:
5094 * The locking appears insufficient to guarantee two parallel registers
5095 * will not get the same name.
5096 */
5097
5098int register_netdevice(struct net_device *dev)
5099{
1da177e4 5100 int ret;
d314774c 5101 struct net *net = dev_net(dev);
1da177e4
LT
5102
5103 BUG_ON(dev_boot_phase);
5104 ASSERT_RTNL();
5105
b17a7c17
SH
5106 might_sleep();
5107
1da177e4
LT
5108 /* When net_device's are persistent, this will be fatal. */
5109 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5110 BUG_ON(!net);
1da177e4 5111
f1f28aa3 5112 spin_lock_init(&dev->addr_list_lock);
cf508b12 5113 netdev_set_addr_lockdep_class(dev);
1da177e4 5114
1da177e4
LT
5115 dev->iflink = -1;
5116
1b4bf461
ED
5117 ret = netif_alloc_rx_queues(dev);
5118 if (ret)
5119 goto out;
0a9627f2 5120
e6484930
TH
5121 ret = netif_alloc_netdev_queues(dev);
5122 if (ret)
5123 goto out;
5124
5125 netdev_init_queues(dev);
0a9627f2 5126
1da177e4 5127 /* Init, if this function is available */
d314774c
SH
5128 if (dev->netdev_ops->ndo_init) {
5129 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5130 if (ret) {
5131 if (ret > 0)
5132 ret = -EIO;
90833aa4 5133 goto out;
1da177e4
LT
5134 }
5135 }
4ec93edb 5136
8ce6cebc 5137 ret = dev_get_valid_name(dev, dev->name, 0);
d9031024 5138 if (ret)
7ce1b0ed 5139 goto err_uninit;
1da177e4 5140
881d966b 5141 dev->ifindex = dev_new_index(net);
1da177e4
LT
5142 if (dev->iflink == -1)
5143 dev->iflink = dev->ifindex;
5144
d212f87b
SH
5145 /* Fix illegal checksum combinations */
5146 if ((dev->features & NETIF_F_HW_CSUM) &&
5147 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5148 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5149 dev->name);
5150 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5151 }
5152
5153 if ((dev->features & NETIF_F_NO_CSUM) &&
5154 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5155 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5156 dev->name);
5157 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5158 }
5159
b63365a2 5160 dev->features = netdev_fix_features(dev->features, dev->name);
1da177e4 5161
e5a4a72d
LB
5162 /* Enable software GSO if SG is supported. */
5163 if (dev->features & NETIF_F_SG)
5164 dev->features |= NETIF_F_GSO;
5165
c5256c51
ED
 5166 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5167 * vlan_dev_init() will do the dev->features check, so these features
5168 * are enabled only if supported by underlying device.
16c3ea78 5169 */
c5256c51 5170 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
16c3ea78 5171
7ffbe3fd
JB
5172 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5173 ret = notifier_to_errno(ret);
5174 if (ret)
5175 goto err_uninit;
5176
8b41d188 5177 ret = netdev_register_kobject(dev);
b17a7c17 5178 if (ret)
7ce1b0ed 5179 goto err_uninit;
b17a7c17
SH
5180 dev->reg_state = NETREG_REGISTERED;
5181
1da177e4
LT
5182 /*
5183 * Default initial state at registry is that the
5184 * device is present.
5185 */
5186
5187 set_bit(__LINK_STATE_PRESENT, &dev->state);
5188
1da177e4 5189 dev_init_scheduler(dev);
1da177e4 5190 dev_hold(dev);
ce286d32 5191 list_netdevice(dev);
1da177e4
LT
5192
5193 /* Notify protocols, that a new device appeared. */
056925ab 5194 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5195 ret = notifier_to_errno(ret);
93ee31f1
DL
5196 if (ret) {
5197 rollback_registered(dev);
5198 dev->reg_state = NETREG_UNREGISTERED;
5199 }
d90a909e
EB
5200 /*
5201 * Prevent userspace races by waiting until the network
5202 * device is fully setup before sending notifications.
5203 */
a2835763
PM
5204 if (!dev->rtnl_link_ops ||
5205 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5206 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5207
5208out:
5209 return ret;
7ce1b0ed
HX
5210
5211err_uninit:
d314774c
SH
5212 if (dev->netdev_ops->ndo_uninit)
5213 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5214 goto out;
1da177e4 5215}
d1b19dff 5216EXPORT_SYMBOL(register_netdevice);
1da177e4 5217
937f1ba5
BH
5218/**
5219 * init_dummy_netdev - init a dummy network device for NAPI
5220 * @dev: device to init
5221 *
 5222 * This takes a network device structure and initializes the minimum
 5223 * number of fields so it can be used to schedule NAPI polls without
 5224 * registering a full-blown interface. This is to be used by drivers
5225 * that need to tie several hardware interfaces to a single NAPI
5226 * poll scheduler due to HW limitations.
5227 */
5228int init_dummy_netdev(struct net_device *dev)
5229{
5230 /* Clear everything. Note we don't initialize spinlocks
 5231 * as they aren't supposed to be taken by any of the
 5232 * NAPI code and this dummy netdev is supposed to be
 5233 * only ever used for NAPI polls.
5234 */
5235 memset(dev, 0, sizeof(struct net_device));
5236
5237 /* make sure we BUG if trying to hit standard
5238 * register/unregister code path
5239 */
5240 dev->reg_state = NETREG_DUMMY;
5241
937f1ba5
BH
5242 /* NAPI wants this */
5243 INIT_LIST_HEAD(&dev->napi_list);
5244
5245 /* a dummy interface is started by default */
5246 set_bit(__LINK_STATE_PRESENT, &dev->state);
5247 set_bit(__LINK_STATE_START, &dev->state);
5248
29b4433d
ED
 5249 /* Note: We don't allocate pcpu_refcnt for dummy devices,
 5250 * because users of this 'device' don't need to change
5251 * its refcount.
5252 */
5253
937f1ba5
BH
5254 return 0;
5255}
5256EXPORT_SYMBOL_GPL(init_dummy_netdev);
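
/*
 * Illustrative sketch (not part of dev.c): a driver with several
 * hardware channels but no per-channel net_device can hang its NAPI
 * context off one dummy device, as described above. struct my_adapter
 * and my_poll() are hypothetical.
 */
static void my_adapter_init(struct my_adapter *ad)
{
	init_dummy_netdev(&ad->dummy_dev);
	netif_napi_add(&ad->dummy_dev, &ad->napi, my_poll, 64);
	napi_enable(&ad->napi);
}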
5257
5258
1da177e4
LT
5259/**
5260 * register_netdev - register a network device
5261 * @dev: device to register
5262 *
5263 * Take a completed network device structure and add it to the kernel
5264 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5265 * chain. 0 is returned on success. A negative errno code is returned
5266 * on a failure to set up the device, or if the name is a duplicate.
5267 *
38b4da38 5268 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5269 * and expands the device name if you passed a format string to
5270 * alloc_netdev.
5271 */
5272int register_netdev(struct net_device *dev)
5273{
5274 int err;
5275
5276 rtnl_lock();
5277
5278 /*
5279 * If the name is a format string the caller wants us to do a
5280 * name allocation.
5281 */
5282 if (strchr(dev->name, '%')) {
5283 err = dev_alloc_name(dev, dev->name);
5284 if (err < 0)
5285 goto out;
5286 }
4ec93edb 5287
1da177e4
LT
5288 err = register_netdevice(dev);
5289out:
5290 rtnl_unlock();
5291 return err;
5292}
5293EXPORT_SYMBOL(register_netdev);
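
/*
 * Illustrative sketch (not part of dev.c): the classic probe sequence
 * around register_netdev(). A '%d' in the name template makes the core
 * pick the unit number, per the comment above. struct my_priv and
 * my_setup() (a setup callback in the style of ether_setup()) are
 * hypothetical.
 */
static int my_driver_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
			      my_setup, 1);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);	/* takes rtnl, expands "myeth%d" */
	if (err)
		free_netdev(dev);
	return err;
}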
5294
29b4433d
ED
5295int netdev_refcnt_read(const struct net_device *dev)
5296{
5297 int i, refcnt = 0;
5298
5299 for_each_possible_cpu(i)
5300 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5301 return refcnt;
5302}
5303EXPORT_SYMBOL(netdev_refcnt_read);
5304
1da177e4
LT
5305/*
5306 * netdev_wait_allrefs - wait until all references are gone.
5307 *
5308 * This is called when unregistering network devices.
5309 *
5310 * Any protocol or device that holds a reference should register
 5312 * for netdevice notification, and clean up and put back the
5312 * reference if they receive an UNREGISTER event.
5313 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5314 * call dev_put.
1da177e4
LT
5315 */
5316static void netdev_wait_allrefs(struct net_device *dev)
5317{
5318 unsigned long rebroadcast_time, warning_time;
29b4433d 5319 int refcnt;
1da177e4 5320
e014debe
ED
5321 linkwatch_forget_dev(dev);
5322
1da177e4 5323 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5324 refcnt = netdev_refcnt_read(dev);
5325
5326 while (refcnt != 0) {
1da177e4 5327 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5328 rtnl_lock();
1da177e4
LT
5329
5330 /* Rebroadcast unregister notification */
056925ab 5331 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5332 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
395264d5 5333 * should have already handled it the first time */
1da177e4
LT
5334
5335 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5336 &dev->state)) {
5337 /* We must not have linkwatch events
5338 * pending on unregister. If this
5339 * happens, we simply run the queue
5340 * unscheduled, resulting in a noop
5341 * for this device.
5342 */
5343 linkwatch_run_queue();
5344 }
5345
6756ae4b 5346 __rtnl_unlock();
1da177e4
LT
5347
5348 rebroadcast_time = jiffies;
5349 }
5350
5351 msleep(250);
5352
29b4433d
ED
5353 refcnt = netdev_refcnt_read(dev);
5354
1da177e4
LT
5355 if (time_after(jiffies, warning_time + 10 * HZ)) {
5356 printk(KERN_EMERG "unregister_netdevice: "
5357 "waiting for %s to become free. Usage "
5358 "count = %d\n",
29b4433d 5359 dev->name, refcnt);
1da177e4
LT
5360 warning_time = jiffies;
5361 }
5362 }
5363}
5364
5365/* The sequence is:
5366 *
5367 * rtnl_lock();
5368 * ...
5369 * register_netdevice(x1);
5370 * register_netdevice(x2);
5371 * ...
5372 * unregister_netdevice(y1);
5373 * unregister_netdevice(y2);
5374 * ...
5375 * rtnl_unlock();
5376 * free_netdev(y1);
5377 * free_netdev(y2);
5378 *
58ec3b4d 5379 * We are invoked by rtnl_unlock().
1da177e4 5380 * This allows us to deal with problems:
b17a7c17 5381 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5382 * without deadlocking with linkwatch via keventd.
5383 * 2) Since we run with the RTNL semaphore not held, we can sleep
5384 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5385 *
5386 * We must not return until all unregister events added during
5387 * the interval the lock was held have been completed.
1da177e4 5388 */
1da177e4
LT
5389void netdev_run_todo(void)
5390{
626ab0e6 5391 struct list_head list;
1da177e4 5392
1da177e4 5393 /* Snapshot list, allow later requests */
626ab0e6 5394 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5395
5396 __rtnl_unlock();
626ab0e6 5397
1da177e4
LT
5398 while (!list_empty(&list)) {
5399 struct net_device *dev
e5e26d75 5400 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5401 list_del(&dev->todo_list);
5402
b17a7c17
SH
5403 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5404 printk(KERN_ERR "network todo '%s' but state %d\n",
5405 dev->name, dev->reg_state);
5406 dump_stack();
5407 continue;
5408 }
1da177e4 5409
b17a7c17 5410 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5411
152102c7 5412 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5413
b17a7c17 5414 netdev_wait_allrefs(dev);
1da177e4 5415
b17a7c17 5416 /* paranoia */
29b4433d 5417 BUG_ON(netdev_refcnt_read(dev));
95ae6b22 5418 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
547b792c
IJ
5419 WARN_ON(dev->ip6_ptr);
5420 WARN_ON(dev->dn_ptr);
1da177e4 5421
b17a7c17
SH
5422 if (dev->destructor)
5423 dev->destructor(dev);
9093bbb2
SH
5424
5425 /* Free network device */
5426 kobject_put(&dev->dev.kobj);
1da177e4 5427 }
1da177e4
LT
5428}
5429
d83345ad
ED
5430/**
5431 * dev_txq_stats_fold - fold tx_queues stats
5432 * @dev: device to get statistics from
3cfde79c 5433 * @stats: struct rtnl_link_stats64 to hold results
d83345ad
ED
5434 */
5435void dev_txq_stats_fold(const struct net_device *dev,
3cfde79c 5436 struct rtnl_link_stats64 *stats)
d83345ad 5437{
bd27290a 5438 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
d83345ad
ED
5439 unsigned int i;
5440 struct netdev_queue *txq;
5441
5442 for (i = 0; i < dev->num_tx_queues; i++) {
5443 txq = netdev_get_tx_queue(dev, i);
bd27290a 5444 spin_lock_bh(&txq->_xmit_lock);
d83345ad
ED
5445 tx_bytes += txq->tx_bytes;
5446 tx_packets += txq->tx_packets;
5447 tx_dropped += txq->tx_dropped;
bd27290a 5448 spin_unlock_bh(&txq->_xmit_lock);
d83345ad
ED
5449 }
5450 if (tx_bytes || tx_packets || tx_dropped) {
5451 stats->tx_bytes = tx_bytes;
5452 stats->tx_packets = tx_packets;
5453 stats->tx_dropped = tx_dropped;
5454 }
5455}
5456EXPORT_SYMBOL(dev_txq_stats_fold);
5457
5458/* Convert net_device_stats to rtnl_link_stats64. They have the same
5459 * fields in the same order, with only the type differing.
5460 */
5461static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5462 const struct net_device_stats *netdev_stats)
5463{
5464#if BITS_PER_LONG == 64
5465 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5466 memcpy(stats64, netdev_stats, sizeof(*stats64));
5467#else
5468 size_t i, n = sizeof(*stats64) / sizeof(u64);
5469 const unsigned long *src = (const unsigned long *)netdev_stats;
5470 u64 *dst = (u64 *)stats64;
5471
5472 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5473 sizeof(*stats64) / sizeof(u64));
5474 for (i = 0; i < n; i++)
5475 dst[i] = src[i];
5476#endif
5477}
5478
5479/**
5480 * dev_get_stats - get network device statistics
5481 * @dev: device to get statistics from
28172739 5482 * @storage: place to store stats
eeda3fd6 5483 *
5484 * Get network statistics from device. Return @storage.
5485 * The device driver may provide its own method by setting
5486 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5487 * otherwise the internal statistics structure is used.
eeda3fd6 5488 */
5489struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5490 struct rtnl_link_stats64 *storage)
7004bf25 5491{
5492 const struct net_device_ops *ops = dev->netdev_ops;
5493
5494 if (ops->ndo_get_stats64) {
5495 memset(storage, 0, sizeof(*storage));
5496 ops->ndo_get_stats64(dev, storage);
5497 } else if (ops->ndo_get_stats) {
3cfde79c 5498 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5499 } else {
5500 netdev_stats_to_stats64(storage, &dev->stats);
5501 dev_txq_stats_fold(dev, storage);
28172739 5502 }
caf586e5 5503 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5504 return storage;
c45d286e 5505}
eeda3fd6 5506EXPORT_SYMBOL(dev_get_stats);
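/*
 * A usage sketch: any caller can snapshot a device's counters with
 * dev_get_stats(); the rtnl_link_stats64 may live on the stack. The
 * dumping helper below is hypothetical.
 */
static void example_dump_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

	netdev_info(dev, "rx %llu tx %llu rx_dropped %llu\n",
		    (unsigned long long)stats->rx_packets,
		    (unsigned long long)stats->tx_packets,
		    (unsigned long long)stats->rx_dropped);
}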
c45d286e 5507
24824a09 5508struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5509{
24824a09 5510 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5511
5512#ifdef CONFIG_NET_CLS_ACT
5513 if (queue)
5514 return queue;
5515 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5516 if (!queue)
5517 return NULL;
5518 netdev_init_one_queue(dev, queue, NULL);
5519 queue->qdisc = &noop_qdisc;
5520 queue->qdisc_sleeping = &noop_qdisc;
5521 rcu_assign_pointer(dev->ingress_queue, queue);
5522#endif
5523 return queue;
5524}
5525
1da177e4 5526/**
f25f4e44 5527 * alloc_netdev_mq - allocate network device
5528 * @sizeof_priv: size of private data to allocate space for
5529 * @name: device name format string
5530 * @setup: callback to initialize device
f25f4e44 5531 * @queue_count: the number of subqueues to allocate
5532 *
5533 * Allocates a struct net_device with private data area for driver use
 5534 * and performs basic initialization. Also allocates subqueue structs
5535 * for each queue on the device at the end of the netdevice.
1da177e4 5536 */
5537struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5538 void (*setup)(struct net_device *), unsigned int queue_count)
1da177e4 5539{
1da177e4 5540 struct net_device *dev;
7943986c 5541 size_t alloc_size;
1ce8e7b5 5542 struct net_device *p;
1da177e4 5543
5544 BUG_ON(strlen(name) >= sizeof(dev->name));
5545
5546 if (queue_count < 1) {
5547 pr_err("alloc_netdev: Unable to allocate device "
5548 "with zero queues.\n");
5549 return NULL;
5550 }
5551
fd2ea0a7 5552 alloc_size = sizeof(struct net_device);
5553 if (sizeof_priv) {
5554 /* ensure 32-byte alignment of private area */
1ce8e7b5 5555 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5556 alloc_size += sizeof_priv;
5557 }
5558 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5559 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5560
31380de9 5561 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 5562 if (!p) {
b6fe17d6 5563 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5564 return NULL;
5565 }
1da177e4 5566
1ce8e7b5 5567 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5568 dev->padded = (char *)dev - (char *)p;
ab9c73cc 5569
5570 dev->pcpu_refcnt = alloc_percpu(int);
5571 if (!dev->pcpu_refcnt)
e6484930 5572 goto free_p;
ab9c73cc 5573
ab9c73cc 5574 if (dev_addr_init(dev))
29b4433d 5575 goto free_pcpu;
ab9c73cc 5576
22bedad3 5577 dev_mc_init(dev);
a748ee24 5578 dev_uc_init(dev);
ccffad25 5579
c346dca1 5580 dev_net_set(dev, &init_net);
1da177e4 5581
e8a0464c 5582 dev->num_tx_queues = queue_count;
fd2ea0a7 5583 dev->real_num_tx_queues = queue_count;
e8a0464c 5584
df334545 5585#ifdef CONFIG_RPS
0a9627f2 5586 dev->num_rx_queues = queue_count;
62fe0b40 5587 dev->real_num_rx_queues = queue_count;
df334545 5588#endif
0a9627f2 5589
82cc1a7a 5590 dev->gso_max_size = GSO_MAX_SIZE;
1da177e4 5591
5592 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5593 dev->ethtool_ntuple_list.count = 0;
d565b0a1 5594 INIT_LIST_HEAD(&dev->napi_list);
9fdce099 5595 INIT_LIST_HEAD(&dev->unreg_list);
e014debe 5596 INIT_LIST_HEAD(&dev->link_watch_list);
93f154b5 5597 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5598 setup(dev);
5599 strcpy(dev->name, name);
5600 return dev;
ab9c73cc 5601
5602free_pcpu:
5603 free_percpu(dev->pcpu_refcnt);
5604free_p:
5605 kfree(p);
5606 return NULL;
1da177e4 5607}
f25f4e44 5608EXPORT_SYMBOL(alloc_netdev_mq);
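/*
 * A usage sketch (hypothetical driver): allocating a four-queue
 * Ethernet device. ether_setup() initializes the generic Ethernet
 * fields, and the "eth%d" pattern is expanded when the device is
 * registered. struct example_priv stands in for real private data.
 */
struct example_priv {
	spinlock_t lock;
};

static struct net_device *example_alloc(void)
{
	return alloc_netdev_mq(sizeof(struct example_priv), "eth%d",
			       ether_setup, 4);
}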
5609
5610/**
5611 * free_netdev - free network device
5612 * @dev: device
5613 *
5614 * This function does the last stage of destroying an allocated device
5615 * interface. The reference to the device object is released.
5616 * If this is the last reference then it will be freed.
5617 */
5618void free_netdev(struct net_device *dev)
5619{
5620 struct napi_struct *p, *n;
5621
5622 release_net(dev_net(dev));
5623
5624 kfree(dev->_tx);
5625
5626 kfree(rcu_dereference_raw(dev->ingress_queue));
5627
5628 /* Flush device addresses */
5629 dev_addr_flush(dev);
5630
5631 /* Clear ethtool n-tuple list */
5632 ethtool_ntuple_flush(dev);
5633
5634 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5635 netif_napi_del(p);
5636
5637 free_percpu(dev->pcpu_refcnt);
5638 dev->pcpu_refcnt = NULL;
5639
3041a069 5640 /* Compatibility with error handling in drivers */
5641 if (dev->reg_state == NETREG_UNINITIALIZED) {
5642 kfree((char *)dev - dev->padded);
5643 return;
5644 }
5645
5646 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5647 dev->reg_state = NETREG_RELEASED;
5648
5649 /* will free via device release */
5650 put_device(&dev->dev);
1da177e4 5651}
d1b19dff 5652EXPORT_SYMBOL(free_netdev);
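/*
 * A usage sketch of the error-handling compatibility noted above: a
 * device that fails to register is still NETREG_UNINITIALIZED, so
 * free_netdev() frees it directly instead of going through the
 * NETREG_UNREGISTERED path. Names here are hypothetical.
 */
static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev_mq(0, "eth%d", ether_setup, 1);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err)
		free_netdev(dev);	/* reg_state == NETREG_UNINITIALIZED */
	return err;
}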
4ec93edb 5653
5654/**
5655 * synchronize_net - Synchronize with packet receive processing
5656 *
5657 * Wait for packets currently being received to be done.
5658 * Does not block later packets from starting.
5659 */
4ec93edb 5660void synchronize_net(void)
5661{
5662 might_sleep();
fbd568a3 5663 synchronize_rcu();
1da177e4 5664}
d1b19dff 5665EXPORT_SYMBOL(synchronize_net);
5666
5667/**
44a0873d 5668 * unregister_netdevice_queue - remove device from the kernel
1da177e4 5669 * @dev: device
44a0873d 5670 * @head: list
6ebfbc06 5671 *
1da177e4 5672 * This function shuts down a device interface and removes it
d59b54b1 5673 * from the kernel tables.
44a0873d 5674 * If @head is not NULL, the device is queued to be unregistered later.
5675 *
5676 * Callers must hold the rtnl semaphore. You may want
5677 * unregister_netdev() instead of this.
5678 */
5679
44a0873d 5680void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 5681{
5682 ASSERT_RTNL();
5683
44a0873d 5684 if (head) {
9fdce099 5685 list_move_tail(&dev->unreg_list, head);
5686 } else {
5687 rollback_registered(dev);
5688 /* Finish processing unregister after unlock */
5689 net_set_todo(dev);
5690 }
1da177e4 5691}
44a0873d 5692EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 5693
5694/**
5695 * unregister_netdevice_many - unregister many devices
5696 * @head: list of devices
5697 */
5698void unregister_netdevice_many(struct list_head *head)
5699{
5700 struct net_device *dev;
5701
5702 if (!list_empty(head)) {
5703 rollback_registered_many(head);
5704 list_for_each_entry(dev, head, unreg_list)
5705 net_set_todo(dev);
5706 }
5707}
63c8099d 5708EXPORT_SYMBOL(unregister_netdevice_many);
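/*
 * A usage sketch: queueing several devices and unregistering them in
 * one batch under a single RTNL hold, the same pattern
 * default_device_exit_batch() below uses on namespace teardown.
 */
static void example_kill_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	rtnl_lock();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}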
9b5e383c 5709
5710/**
5711 * unregister_netdev - remove device from the kernel
5712 * @dev: device
5713 *
5714 * This function shuts down a device interface and removes it
d59b54b1 5715 * from the kernel tables.
5716 *
5717 * This is just a wrapper for unregister_netdevice that takes
5718 * the rtnl semaphore. In general you want to use this and not
5719 * unregister_netdevice.
5720 */
5721void unregister_netdev(struct net_device *dev)
5722{
5723 rtnl_lock();
5724 unregister_netdevice(dev);
5725 rtnl_unlock();
5726}
5727EXPORT_SYMBOL(unregister_netdev);
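/*
 * A usage sketch (hypothetical module removal path): unregister_netdev()
 * takes the RTNL itself, so the caller must not hold it; free_netdev()
 * is only safe once the unregister has completed.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}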
5728
 5729/**
 5730 * dev_change_net_namespace - move device to different network namespace
5731 * @dev: device
5732 * @net: network namespace
5733 * @pat: If not NULL name pattern to try if the current device name
5734 * is already taken in the destination network namespace.
5735 *
5736 * This function shuts down a device interface and moves it
5737 * to a new network namespace. On success 0 is returned, on
 5738 * a failure a negative errno code is returned.
5739 *
5740 * Callers must hold the rtnl semaphore.
5741 */
5742
5743int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5744{
5745 int err;
5746
5747 ASSERT_RTNL();
5748
5749 /* Don't allow namespace local devices to be moved. */
5750 err = -EINVAL;
5751 if (dev->features & NETIF_F_NETNS_LOCAL)
5752 goto out;
5753
 5754 /* Ensure the device has been registered */
5755 err = -EINVAL;
5756 if (dev->reg_state != NETREG_REGISTERED)
5757 goto out;
5758
 5759 /* Get out if there is nothing to do */
5760 err = 0;
878628fb 5761 if (net_eq(dev_net(dev), net))
5762 goto out;
5763
5764 /* Pick the destination device name, and ensure
5765 * we can use it in the destination network namespace.
5766 */
5767 err = -EEXIST;
d9031024 5768 if (__dev_get_by_name(net, dev->name)) {
5769 /* We get here if we can't use the current device name */
5770 if (!pat)
5771 goto out;
8ce6cebc 5772 if (dev_get_valid_name(dev, pat, 1))
5773 goto out;
5774 }
5775
5776 /*
 5777 * And now a mini version of register_netdevice and unregister_netdevice.
5778 */
5779
5780 /* If device is running close it first. */
9b772652 5781 dev_close(dev);
5782
5783 /* And unlink it from device chain */
5784 err = -ENODEV;
5785 unlist_netdevice(dev);
5786
5787 synchronize_net();
5788
5789 /* Shutdown queueing discipline. */
5790 dev_shutdown(dev);
5791
 5792 /* Notify protocols that we are about to destroy
 5793 this device. They should clean up all of their state.
 5794
 5795 Note that dev->reg_state stays at NETREG_REGISTERED.
 5796 This is wanted because this way 8021q and macvlan know
 5797 the device is just moving and can keep their slaves up.
5798 */
5799 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5800 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5801
5802 /*
5803 * Flush the unicast and multicast chains
5804 */
a748ee24 5805 dev_uc_flush(dev);
22bedad3 5806 dev_mc_flush(dev);
5807
5808 /* Actually switch the network namespace */
c346dca1 5809 dev_net_set(dev, net);
ce286d32 5810
5811 /* If there is an ifindex conflict assign a new one */
5812 if (__dev_get_by_index(net, dev->ifindex)) {
5813 int iflink = (dev->iflink == dev->ifindex);
5814 dev->ifindex = dev_new_index(net);
5815 if (iflink)
5816 dev->iflink = dev->ifindex;
5817 }
5818
8b41d188 5819 /* Fixup kobjects */
a1b3f594 5820 err = device_rename(&dev->dev, dev->name);
8b41d188 5821 WARN_ON(err);
5822
5823 /* Add the device back in the hashes */
5824 list_netdevice(dev);
5825
 5826 /* Notify protocols that a new device has appeared. */
5827 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5828
5829 /*
5830 * Prevent userspace races by waiting until the network
5831 * device is fully setup before sending notifications.
5832 */
5833 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5834
5835 synchronize_net();
5836 err = 0;
5837out:
5838 return err;
5839}
463d0183 5840EXPORT_SYMBOL_GPL(dev_change_net_namespace);
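/*
 * A usage sketch: moving a device into another namespace under the
 * RTNL, supplying a "dev%d" fallback pattern in case the name is
 * already taken there, as default_device_exit() below does.
 */
static int example_move_dev(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}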
ce286d32 5841
5842static int dev_cpu_callback(struct notifier_block *nfb,
5843 unsigned long action,
5844 void *ocpu)
5845{
5846 struct sk_buff **list_skb;
5847 struct sk_buff *skb;
5848 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5849 struct softnet_data *sd, *oldsd;
5850
8bb78442 5851 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5852 return NOTIFY_OK;
5853
5854 local_irq_disable();
5855 cpu = smp_processor_id();
5856 sd = &per_cpu(softnet_data, cpu);
5857 oldsd = &per_cpu(softnet_data, oldcpu);
5858
5859 /* Find end of our completion_queue. */
5860 list_skb = &sd->completion_queue;
5861 while (*list_skb)
5862 list_skb = &(*list_skb)->next;
5863 /* Append completion queue from offline CPU. */
5864 *list_skb = oldsd->completion_queue;
5865 oldsd->completion_queue = NULL;
5866
1da177e4 5867 /* Append output queue from offline CPU. */
5868 if (oldsd->output_queue) {
5869 *sd->output_queue_tailp = oldsd->output_queue;
5870 sd->output_queue_tailp = oldsd->output_queue_tailp;
5871 oldsd->output_queue = NULL;
5872 oldsd->output_queue_tailp = &oldsd->output_queue;
5873 }
5874
5875 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5876 local_irq_enable();
5877
5878 /* Process offline CPU's input_pkt_queue */
76cc8b13 5879 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 5880 netif_rx(skb);
76cc8b13 5881 input_queue_head_incr(oldsd);
fec5e652 5882 }
76cc8b13 5883 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 5884 netif_rx(skb);
5885 input_queue_head_incr(oldsd);
5886 }
5887
5888 return NOTIFY_OK;
5889}
5890
5891
7f353bf2 5892/**
5893 * netdev_increment_features - increment feature set by one
5894 * @all: current feature set
5895 * @one: new feature set
5896 * @mask: mask feature set
5897 *
5898 * Computes a new feature set after adding a device with feature set
5899 * @one to the master device with current feature set @all. Will not
5900 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 5901 */
5902unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5903 unsigned long mask)
5904{
5905 /* If device needs checksumming, downgrade to it. */
d1b19dff 5906 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5907 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5908 else if (mask & NETIF_F_ALL_CSUM) {
5909 /* If one device supports v4/v6 checksumming, set for all. */
5910 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5911 !(all & NETIF_F_GEN_CSUM)) {
5912 all &= ~NETIF_F_ALL_CSUM;
5913 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5914 }
e2a6b852 5915
5916 /* If one device supports hw checksumming, set for all. */
5917 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5918 all &= ~NETIF_F_ALL_CSUM;
5919 all |= NETIF_F_HW_CSUM;
5920 }
5921 }
7f353bf2 5922
b63365a2 5923 one |= NETIF_F_ALL_CSUM;
7f353bf2 5924
b63365a2 5925 one |= all & NETIF_F_ONE_FOR_ALL;
d9f5950f 5926 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
b63365a2 5927 all |= one & mask & NETIF_F_ONE_FOR_ALL;
5928
5929 return all;
5930}
b63365a2 5931EXPORT_SYMBOL(netdev_increment_features);
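/*
 * A usage sketch (hypothetical master driver): folding one slave's
 * feature set into the master's, restricted by a mask. The mask shown
 * is illustrative only; real callers (bonding, bridge) define their
 * own feature masks.
 */
static unsigned long example_fold_features(struct net_device *master,
					   struct net_device *slave)
{
	return netdev_increment_features(master->features, slave->features,
					 NETIF_F_ALL_CSUM | NETIF_F_SG);
}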
7f353bf2 5932
5933static struct hlist_head *netdev_create_hash(void)
5934{
5935 int i;
5936 struct hlist_head *hash;
5937
5938 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5939 if (hash != NULL)
5940 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5941 INIT_HLIST_HEAD(&hash[i]);
5942
5943 return hash;
5944}
5945
881d966b 5946/* Initialize per network namespace state */
4665079c 5947static int __net_init netdev_init(struct net *net)
881d966b 5948{
881d966b 5949 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 5950
5951 net->dev_name_head = netdev_create_hash();
5952 if (net->dev_name_head == NULL)
5953 goto err_name;
881d966b 5954
5955 net->dev_index_head = netdev_create_hash();
5956 if (net->dev_index_head == NULL)
5957 goto err_idx;
5958
5959 return 0;
5960
5961err_idx:
5962 kfree(net->dev_name_head);
5963err_name:
5964 return -ENOMEM;
5965}
5966
5967/**
5968 * netdev_drivername - network driver for the device
5969 * @dev: network device
5970 * @buffer: buffer for resulting name
5971 * @len: size of buffer
5972 *
5973 * Determine network driver for device.
5974 */
cf04a4c7 5975char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6579e57b 5976{
5977 const struct device_driver *driver;
5978 const struct device *parent;
5979
5980 if (len <= 0 || !buffer)
5981 return buffer;
5982 buffer[0] = 0;
5983
5984 parent = dev->dev.parent;
5985
5986 if (!parent)
5987 return buffer;
5988
5989 driver = parent->driver;
5990 if (driver && driver->name)
5991 strlcpy(buffer, driver->name, len);
5992 return buffer;
5993}
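/*
 * A usage sketch: netdev_drivername() always returns the caller's
 * buffer (empty if no parent driver is bound), so it can be used
 * directly inside a printk. The helper below is hypothetical.
 */
static void example_show_driver(const struct net_device *dev)
{
	char drv[64];

	printk(KERN_INFO "%s: driver %s\n", dev->name,
	       netdev_drivername(dev, drv, sizeof(drv)));
}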
5994
5995static int __netdev_printk(const char *level, const struct net_device *dev,
5996 struct va_format *vaf)
5997{
5998 int r;
5999
6000 if (dev && dev->dev.parent)
6001 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6002 netdev_name(dev), vaf);
6003 else if (dev)
6004 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6005 else
6006 r = printk("%s(NULL net_device): %pV", level, vaf);
6007
6008 return r;
6009}
6010
6011int netdev_printk(const char *level, const struct net_device *dev,
6012 const char *format, ...)
6013{
6014 struct va_format vaf;
6015 va_list args;
6016 int r;
6017
6018 va_start(args, format);
6019
6020 vaf.fmt = format;
6021 vaf.va = &args;
6022
6023 r = __netdev_printk(level, dev, &vaf);
6024 va_end(args);
6025
6026 return r;
6027}
6028EXPORT_SYMBOL(netdev_printk);
6029
6030#define define_netdev_printk_level(func, level) \
6031int func(const struct net_device *dev, const char *fmt, ...) \
6032{ \
6033 int r; \
6034 struct va_format vaf; \
6035 va_list args; \
6036 \
6037 va_start(args, fmt); \
6038 \
6039 vaf.fmt = fmt; \
6040 vaf.va = &args; \
6041 \
6042 r = __netdev_printk(level, dev, &vaf); \
6043 va_end(args); \
6044 \
6045 return r; \
6046} \
6047EXPORT_SYMBOL(func);
6048
6049define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6050define_netdev_printk_level(netdev_alert, KERN_ALERT);
6051define_netdev_printk_level(netdev_crit, KERN_CRIT);
6052define_netdev_printk_level(netdev_err, KERN_ERR);
6053define_netdev_printk_level(netdev_warn, KERN_WARNING);
6054define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6055define_netdev_printk_level(netdev_info, KERN_INFO);
6056
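/*
 * A usage sketch: the helpers generated above behave like printk()
 * at the given level but automatically prefix the driver and device
 * name. The error value below is hypothetical.
 */
static void example_log(struct net_device *dev, int err)
{
	if (err)
		netdev_err(dev, "reset failed with error %d\n", err);
	else
		netdev_info(dev, "link is up\n");
}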
4665079c 6057static void __net_exit netdev_exit(struct net *net)
6058{
6059 kfree(net->dev_name_head);
6060 kfree(net->dev_index_head);
6061}
6062
022cbae6 6063static struct pernet_operations __net_initdata netdev_net_ops = {
6064 .init = netdev_init,
6065 .exit = netdev_exit,
6066};
6067
4665079c 6068static void __net_exit default_device_exit(struct net *net)
ce286d32 6069{
e008b5fc 6070 struct net_device *dev, *aux;
ce286d32 6071 /*
e008b5fc 6072 * Push all migratable network devices back to the
6073 * initial network namespace
6074 */
6075 rtnl_lock();
e008b5fc 6076 for_each_netdev_safe(net, dev, aux) {
ce286d32 6077 int err;
aca51397 6078 char fb_name[IFNAMSIZ];
6079
 6080 /* Ignore unmovable devices (e.g. loopback) */
6081 if (dev->features & NETIF_F_NETNS_LOCAL)
6082 continue;
6083
6084 /* Leave virtual devices for the generic cleanup */
6085 if (dev->rtnl_link_ops)
6086 continue;
d0c082ce 6087
ce286d32 6088 /* Push remaining network devices to init_net */
6089 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6090 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6091 if (err) {
aca51397 6092 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
ce286d32 6093 __func__, dev->name, err);
aca51397 6094 BUG();
6095 }
6096 }
6097 rtnl_unlock();
6098}
6099
6100static void __net_exit default_device_exit_batch(struct list_head *net_list)
6101{
 6102 /* At exit all network devices must be removed from a network
 6103 * namespace. Do this in the reverse order of registration.
6104 * Do this across as many network namespaces as possible to
6105 * improve batching efficiency.
6106 */
6107 struct net_device *dev;
6108 struct net *net;
6109 LIST_HEAD(dev_kill_list);
6110
6111 rtnl_lock();
6112 list_for_each_entry(net, net_list, exit_list) {
6113 for_each_netdev_reverse(net, dev) {
6114 if (dev->rtnl_link_ops)
6115 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6116 else
6117 unregister_netdevice_queue(dev, &dev_kill_list);
6118 }
6119 }
6120 unregister_netdevice_many(&dev_kill_list);
6121 rtnl_unlock();
6122}
6123
022cbae6 6124static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6125 .exit = default_device_exit,
04dc7f6b 6126 .exit_batch = default_device_exit_batch,
6127};
6128
6129/*
6130 * Initialize the DEV module. At boot time this walks the device list and
6131 * unhooks any devices that fail to initialise (normally hardware not
6132 * present) and leaves us with a valid list of present and active devices.
6133 *
6134 */
6135
6136/*
6137 * This is called single threaded during boot, so no need
6138 * to take the rtnl semaphore.
6139 */
6140static int __init net_dev_init(void)
6141{
6142 int i, rc = -ENOMEM;
6143
6144 BUG_ON(!dev_boot_phase);
6145
6146 if (dev_proc_init())
6147 goto out;
6148
8b41d188 6149 if (netdev_kobject_init())
6150 goto out;
6151
6152 INIT_LIST_HEAD(&ptype_all);
82d8a867 6153 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6154 INIT_LIST_HEAD(&ptype_base[i]);
6155
6156 if (register_pernet_subsys(&netdev_net_ops))
6157 goto out;
6158
6159 /*
6160 * Initialise the packet receive queues.
6161 */
6162
6f912042 6163 for_each_possible_cpu(i) {
e36fa2f7 6164 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6165
dee42870 6166 memset(sd, 0, sizeof(*sd));
e36fa2f7 6167 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6168 skb_queue_head_init(&sd->process_queue);
6169 sd->completion_queue = NULL;
6170 INIT_LIST_HEAD(&sd->poll_list);
6171 sd->output_queue = NULL;
6172 sd->output_queue_tailp = &sd->output_queue;
df334545 6173#ifdef CONFIG_RPS
6174 sd->csd.func = rps_trigger_softirq;
6175 sd->csd.info = sd;
6176 sd->csd.flags = 0;
6177 sd->cpu = i;
1e94d72f 6178#endif
0a9627f2 6179
6180 sd->backlog.poll = process_backlog;
6181 sd->backlog.weight = weight_p;
6182 sd->backlog.gro_list = NULL;
6183 sd->backlog.gro_count = 0;
6184 }
6185
6186 dev_boot_phase = 0;
6187
 6188 /* The loopback device is special. If any other network device
 6189 * is present in a network namespace, the loopback device must
 6190 * be present too. Since we now dynamically allocate and free the
 6191 * loopback device, ensure this invariant is maintained by
 6192 * keeping the loopback device as the first device on the
 6193 * list of network devices. This ensures the loopback device
 6194 * is the first device that appears and the last network device
 6195 * that disappears.
6196 */
6197 if (register_pernet_device(&loopback_net_ops))
6198 goto out;
6199
6200 if (register_pernet_device(&default_device_ops))
6201 goto out;
6202
6203 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6204 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6205
6206 hotcpu_notifier(dev_cpu_callback, 0);
6207 dst_init();
6208 dev_mcast_init();
6209 rc = 0;
6210out:
6211 return rc;
6212}
6213
6214subsys_initcall(net_dev_init);
6215
6216static int __init initialize_hashrnd(void)
6217{
0a9627f2 6218 get_random_bytes(&hashrnd, sizeof(hashrnd));
6219 return 0;
6220}
6221
6222late_initcall_sync(initialize_hashrnd);
6223