/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that the queue is empty, it just means that
   the discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once-dequeued packet. It is used by non-standard or
   just buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
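
/*
 * Illustrative sketch (not part of the original file): a minimal qdisc
 * following the conventions above. The function names are hypothetical;
 * enqueue returns one of the NET_XMIT_* codes described in the review,
 * and dequeue may return NULL without the queue being empty. Wrapped in
 * #if 0 so it is never compiled.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (sch->q.qlen < sch->dev->tx_queue_len) {
		__skb_queue_tail(&sch->q, skb);	/* q->q is a real queue here */
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return 0;			/* enqueued successfully */
	}
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;			/* this very packet was dropped */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* NULL here does not mean empty; q->q.qlen == 0 does. */
	return __skb_dequeue(&sch->q);
}
#endif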

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
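
/*
 * Usage sketch (illustrative): a scheduler module registers its ops at
 * load time and unregisters on unload. "example_qdisc_ops" and the
 * enqueue/dequeue functions are hypothetical; real in-tree users are
 * e.g. pfifo_qdisc_ops/bfifo_qdisc_ops registered in pktsched_init()
 * below. Wrapped in #if 0 so it is never compiled.
 */
#if 0
static struct Qdisc_ops example_qdisc_ops = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif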

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children etc.)
 */

static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock(&qdisc_tree_lock);
	q = __qdisc_lookup(dev, handle);
	read_unlock(&qdisc_tree_lock);
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
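
/*
 * Usage sketch (illustrative, loosely modelled on what sch_tbf does):
 * a qdisc's init/change hands the user-supplied tc_ratespec plus the
 * 1024-byte table attribute to qdisc_get_rtab and drops the reference
 * in its destroy path. "qopt", "tb" and TCA_EXAMPLE_RTAB are hypothetical.
 */
#if 0
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB-1]);
	if (rtab == NULL)
		return -EINVAL;
	/* ... later, in ->destroy(): */
	qdisc_put_rtab(rtab);
#endif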

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
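
/*
 * Handle layout note (illustrative): a 32-bit handle packs a 16-bit
 * major and a 16-bit minor number, so tc's "1:2" is TC_H_MAKE(0x10000, 2)
 * == 0x00010002; TC_H_MAJ() keeps the upper half, TC_H_MIN() the lower.
 * The pool above hands out majors starting at 0x8001 with minor 0.
 */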

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {	/* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
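
/*
 * Usage sketch (illustrative): when a classful qdisc purges an inner
 * child queue outside the normal enqueue/dequeue path, the qlen
 * counters of all ancestors must be corrected; "cl" is hypothetical.
 */
#if 0
	unsigned int qlen = cl->q->q.qlen;

	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);
#endif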

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   Old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;

	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again on
				 * replay, so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * an ops->reset() here? The qdisc was never
				 * in action, so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker w;
	struct Qdisc *p;
	int depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

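/*
 * Worked example (illustrative): with root qdisc 1: whose class 1:1
 * leads to qdisc 10:, a request to graft the existing qdisc 1: under a
 * class of 10: ends up in check_loop(q = 1:, p = 10:, 0); the walk over
 * the classes of 1: finds 10: as a leaf, so the request fails with -ELOOP.
 */
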
/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * either to change it or to create/graft a new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if both the CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor wanted to say
				 * that the qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is a sort of hole in the API; we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft if the
				 * user gave a KIND which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
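
/*
 * For orientation (assumed iproute2 behaviour, not guaranteed by this
 * file): "tc qdisc add" sends NLM_F_CREATE|NLM_F_EXCL, "tc qdisc change"
 * sends no flags, and "tc qdisc replace" sends NLM_F_CREATE|NLM_F_REPLACE,
 * which map onto the create_n_graft/change decisions above.
 */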

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}


/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/


static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol and asks
   the specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
					       tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else
			return err;
#endif
		}
	}
	return -1;
}
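
/*
 * Usage sketch (illustrative, close to what classful qdiscs such as
 * sch_prio do in enqueue): run the filter list and map the returned
 * class id to an internal band; "band" is hypothetical.
 */
#if 0
	struct tcf_result res;

	/* tc_classify() returns -1 when no filter matched */
	if (tc_classify(skb, q->filter_list, &res) >= 0)
		band = TC_H_MIN(res.classid) - 1;
#endif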

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,
		   1000000, HZ);

	return 0;
}

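/*
 * Reading this file (illustrative): "cat /proc/net/psched" prints four
 * hex words (ticks per usec, usecs per tick, 1000000, HZ), e.g.
 * "00000001 00000001 000f4240 00000064" with the default 1:1 clock
 * parameters on a HZ=100 kernel.
 */
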
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);