/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady state burst size, the other one,
	with rate P (peak rate) and depth M (equal to link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
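
/* Worked example (illustrative numbers, not taken from the comment above):
 * with limit L = 10000 bytes, burst B = 2000 bytes, rate R = 125000
 * bytes/sec (1 Mbit/s) and no peak rate, the reshaping latency bound is
 *
 *	lat = (L - B)/R = (10000 - 2000)/125000 = 64 ms
 *
 * and the HZ=100 constraint R_crit = B*HZ caps the sustainable rate for
 * this burst size at 200 Kbytes/sec (~1.6 Mbit/s).
 */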

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	u32		max_size;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	bool peak_present;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};
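
/* A note on units (illustrative figures): tokens and buffer are kept as
 * nanoseconds of transmission time at the configured rate, ptokens and mtu
 * as nanoseconds at the peak rate. At R = 125000 bytes/sec (1 Mbit/s) one
 * byte is worth 8000 ns of tokens, so a 10000 byte buffer corresponds to
 * 80 ms of token credit.
 */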


/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		if (likely(segs->len <= q->max_size)) {
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			ret = qdisc_enqueue(segs, q->qdisc);
		} else {
			ret = qdisc_reshape_fail(skb, sch);
		}
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				sch->qstats.drops++;
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
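
/* For instance (illustrative numbers): a 64 Kbyte GSO packet with
 * gso_size 1448 is split into roughly 45 segments above; each segment is
 * enqueued on the inner qdisc on its own, and the qlen bookkeeping is then
 * adjusted for the nb - 1 extra packets created by segmentation.
 */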

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb))
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_to_ns(ktime_get());
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (q->peak_present) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		sch->qstats.overlimits++;
	}
	return NULL;
}
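
/* Worked example (illustrative numbers): at R = 125000 bytes/sec a
 * 1500 byte packet costs psched_l2t_ns() = 12 ms worth of tokens. If only
 * 9 ms of credit has accumulated, toks ends up at -3 ms and the watchdog
 * above is armed for now + 3 ms, when the packet becomes eligible.
 */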

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_to_ns(ktime_get());
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_PTAB + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	psched_ratecfg_precompute(&q->rate, &rtab->rate);
	if (ptab) {
		psched_ratecfg_precompute(&q->peak, &ptab->rate);
		q->peak_present = true;
	} else {
		q->peak_present = false;
	}

	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}
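
/* Usage note: a typical userspace command that exercises this path is
 * something like (device name and values illustrative):
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * tc turns rate/burst/latency into the tc_tbf_qopt rate, buffer and limit
 * fields parsed above.
 */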

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_to_ns(ktime_get());
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (q->peak_present)
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");