diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/Kconfig | 2 | ||||
-rw-r--r-- | net/sched/act_api.c | 3 | ||||
-rw-r--r-- | net/sched/cls_api.c | 2 | ||||
-rw-r--r-- | net/sched/cls_cgroup.c | 24 | ||||
-rw-r--r-- | net/sched/sch_api.c | 20 | ||||
-rw-r--r-- | net/sched/sch_cbq.c | 3 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 11 | ||||
-rw-r--r-- | net/sched/sch_htb.c | 139 | ||||
-rw-r--r-- | net/sched/sch_mq.c | 4 | ||||
-rw-r--r-- | net/sched/sch_mqprio.c | 4 | ||||
-rw-r--r-- | net/sched/sch_qfq.c | 830 |
11 files changed, 717 insertions, 325 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 62fb51face8..235e01acac5 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -509,7 +509,7 @@ config NET_EMATCH_TEXT config NET_EMATCH_CANID tristate "CAN Identifier" - depends on NET_EMATCH && CAN + depends on NET_EMATCH && (CAN=y || CAN=m) ---help--- Say Y here if you want to be able to classify CAN frames based on CAN Identifier. diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 102761d294c..65d240cbf74 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -987,6 +987,9 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; + if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN)) + return -EPERM; + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); if (ret < 0) return ret; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7ae02892437..ff55ed6c49b 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -139,6 +139,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) int err; int tp_created = 0; + if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN)) + return -EPERM; replay: t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 31f06b63357..6db7855b902 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> +#include <linux/fdtable.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> @@ -57,6 +58,28 @@ static void cgrp_css_free(struct cgroup *cgrp) kfree(cgrp_cls_state(cgrp)); } +static int update_classid(const void *v, struct file *file, unsigned n) +{ + int err; + struct socket *sock = sock_from_file(file, &err); + if (sock) + sock->sk->sk_classid = (u32)(unsigned long)v; + return 0; +} + +static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct task_struct *p; + void *v; + + cgroup_taskset_for_each(p, cgrp, tset) { + task_lock(p); + v = (void *)(unsigned long)task_cls_classid(p); + iterate_fd(p->files, 0, update_classid, v); + task_unlock(p); + } +} + static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) { return cgrp_cls_state(cgrp)->classid; @@ -82,6 +105,7 @@ struct cgroup_subsys net_cls_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, + .attach = cgrp_attach, .subsys_id = net_cls_subsys_id, .base_cftypes = ss_files, .module = THIS_MODULE, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a18d975db59..d84f7e734cd 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -495,16 +495,15 @@ EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { - ktime_t time; - if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; qdisc_throttled(wd->qdisc); - time = ktime_set(0, 0); - time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); - hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); + + hrtimer_start(&wd->timer, + ns_to_ktime(PSCHED_TICKS2NS(expires)), + HRTIMER_MODE_ABS); } EXPORT_SYMBOL(qdisc_watchdog_schedule); @@ -834,6 +833,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out3; } lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); + if (!netif_is_multiqueue(dev)) + sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; @@ -981,6 +982,9 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *p = NULL; int err; + if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN)) + return -EPERM; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; @@ -1044,6 +1048,9 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *q, *p; int err; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + replay: /* Reinit, just in case something touches this. */ tcm = nlmsg_data(n); @@ -1380,6 +1387,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) u32 qid = TC_H_MAJ(clid); int err; + if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN)) + return -EPERM; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 564b9fc8efd..0e19948470b 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -509,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl) cl->cpriority = TC_CBQ_MAXPRIO; q->pmask |= (1<<TC_CBQ_MAXPRIO); - expires = ktime_set(0, 0); - expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched)); + expires = ns_to_ktime(PSCHED_TICKS2NS(sched)); if (hrtimer_try_to_cancel(&q->delay_timer) && ktime_to_ns(ktime_sub( hrtimer_get_expires(&q->delay_timer), diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aefc1504dc8..5d81a447851 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -53,20 +53,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { struct sk_buff *skb = q->gso_skb; + const struct netdev_queue *txq = q->dev_queue; if (unlikely(skb)) { - struct net_device *dev = qdisc_dev(q); - struct netdev_queue *txq; - /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else skb = NULL; } else { - skb = q->dequeue(q); + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); } return skb; @@ -686,6 +685,8 @@ static void attach_one_default_qdisc(struct net_device *dev, netdev_info(dev, "activation failed\n"); return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; } dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9d75b776131..d2922c0ef57 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -71,6 +71,12 @@ enum htb_cmode { HTB_CAN_SEND /* class can send */ }; +struct htb_rate_cfg { + u64 rate_bps; + u32 mult; + u32 shift; +}; + /* interior & leaf nodes; props specific to leaves are marked L: */ struct htb_class { struct Qdisc_class_common common; @@ -118,11 +124,11 @@ struct htb_class { int filter_cnt; /* token bucket parameters */ - struct qdisc_rate_table *rate; /* rate table of the class itself */ - struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ - long buffer, cbuffer; /* token bucket depth/rate */ + struct htb_rate_cfg rate; + struct htb_rate_cfg ceil; + s64 buffer, cbuffer; /* token bucket depth/rate */ psched_tdiff_t mbuffer; /* max wait time */ - long tokens, ctokens; /* current number of tokens */ + s64 tokens, ctokens; /* current number of tokens */ psched_time_t t_c; /* checkpoint time */ }; @@ -162,6 +168,45 @@ struct htb_sched { struct work_struct work; }; +static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len) +{ + return ((u64)len * r->mult) >> r->shift; +} + +static void htb_precompute_ratedata(struct htb_rate_cfg *r) +{ + u64 factor; + u64 mult; + int shift; + + r->shift = 0; + r->mult = 1; + /* + * Calibrate mult, shift so that token counting is accurate + * for smallest packet size (64 bytes). Token (time in ns) is + * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will + * work as long as the smallest packet transfer time can be + * accurately represented in nanosec. + */ + if (r->rate_bps > 0) { + /* + * Higher shift gives better accuracy. Find the largest + * shift such that mult fits in 32 bits. + */ + for (shift = 0; shift < 16; shift++) { + r->shift = shift; + factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); + mult = div64_u64(factor, r->rate_bps); + if (mult > UINT_MAX) + break; + } + + r->shift = shift - 1; + factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); + r->mult = div64_u64(factor, r->rate_bps); + } +} + /* find class in global hash table using given handle */ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) { @@ -273,7 +318,7 @@ static void htb_add_to_id_tree(struct rb_root *root, * already in the queue. */ static void htb_add_to_wait_tree(struct htb_sched *q, - struct htb_class *cl, long delay) + struct htb_class *cl, s64 delay) { struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; @@ -441,14 +486,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) htb_remove_class_from_row(q, cl, mask); } -static inline long htb_lowater(const struct htb_class *cl) +static inline s64 htb_lowater(const struct htb_class *cl) { if (htb_hysteresis) return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; else return 0; } -static inline long htb_hiwater(const struct htb_class *cl) +static inline s64 htb_hiwater(const struct htb_class *cl) { if (htb_hysteresis) return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; @@ -469,9 +514,9 @@ static inline long htb_hiwater(const struct htb_class *cl) * mode transitions per time unit. The speed gain is about 1/6. */ static inline enum htb_cmode -htb_class_mode(struct htb_class *cl, long *diff) +htb_class_mode(struct htb_class *cl, s64 *diff) { - long toks; + s64 toks; if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { *diff = -toks; @@ -495,7 +540,7 @@ htb_class_mode(struct htb_class *cl, long *diff) * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). */ static void -htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) { enum htb_cmode new_mode = htb_class_mode(cl, diff); @@ -581,26 +626,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff) { - long toks = diff + cl->tokens; + s64 toks = diff + cl->tokens; if (toks > cl->buffer) toks = cl->buffer; - toks -= (long) qdisc_l2t(cl->rate, bytes); + toks -= (s64) l2t_ns(&cl->rate, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; cl->tokens = toks; } -static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) { - long toks = diff + cl->ctokens; + s64 toks = diff + cl->ctokens; if (toks > cl->cbuffer) toks = cl->cbuffer; - toks -= (long) qdisc_l2t(cl->ceil, bytes); + toks -= (s64) l2t_ns(&cl->ceil, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; @@ -623,10 +668,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, { int bytes = qdisc_pkt_len(skb); enum htb_cmode old_mode; - long diff; + s64 diff; while (cl) { - diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); if (cl->level >= level) { if (cl->level == level) cl->xstats.lends++; @@ -673,7 +718,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, unsigned long stop_at = start + 2; while (time_before(jiffies, stop_at)) { struct htb_class *cl; - long diff; + s64 diff; struct rb_node *p = rb_first(&q->wait_pq[level]); if (!p) @@ -684,7 +729,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, return cl->pq_key; htb_safe_rb_erase(p, q->wait_pq + level); - diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); @@ -871,10 +916,10 @@ ok: if (!sch->q.qlen) goto fin; - q->now = psched_get_time(); + q->now = ktime_to_ns(ktime_get()); start_at = jiffies; - next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; + next_event = q->now + 5 * NSEC_PER_SEC; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ @@ -884,7 +929,7 @@ ok: if (q->now >= q->near_ev_cache[level]) { event = htb_do_events(q, level, start_at); if (!event) - event = q->now + PSCHED_TICKS_PER_SEC; + event = q->now + NSEC_PER_SEC; q->near_ev_cache[level] = event; } else event = q->near_ev_cache[level]; @@ -903,10 +948,17 @@ ok: } } sch->qstats.overlimits++; - if (likely(next_event > q->now)) - qdisc_watchdog_schedule(&q->watchdog, next_event); - else + if (likely(next_event > q->now)) { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { + ktime_t time = ns_to_ktime(next_event); + qdisc_throttled(q->watchdog.qdisc); + hrtimer_start(&q->watchdog.timer, time, + HRTIMER_MODE_ABS); + } + } else { schedule_work(&q->work); + } fin: return skb; } @@ -1082,9 +1134,9 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, memset(&opt, 0, sizeof(opt)); - opt.rate = cl->rate->rate; + opt.rate.rate = cl->rate.rate_bps >> 3; opt.buffer = cl->buffer; - opt.ceil = cl->ceil->rate; + opt.ceil.rate = cl->ceil.rate_bps >> 3; opt.cbuffer = cl->cbuffer; opt.quantum = cl->quantum; opt.prio = cl->prio; @@ -1203,9 +1255,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) qdisc_destroy(cl->un.leaf.q); } gen_kill_estimator(&cl->bstats, &cl->rate_est); - qdisc_put_rtab(cl->rate); - qdisc_put_rtab(cl->ceil); - tcf_destroy_chain(&cl->filter_list); kfree(cl); } @@ -1307,7 +1356,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; - struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[__TCA_HTB_MAX]; struct tc_htb_opt *hopt; @@ -1326,10 +1374,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch); hopt = nla_data(tb[TCA_HTB_PARMS]); - - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); - if (!rtab || !ctab) + if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; if (!cl) { /* new class */ @@ -1439,7 +1484,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, * is really leaf before changing cl->un.leaf ! */ if (!cl->level) { - cl->quantum = rtab->rate.rate / q->rate2quantum; + cl->quantum = hopt->rate.rate / q->rate2quantum; if (!hopt->quantum && cl->quantum < 1000) { pr_warning( "HTB: quantum of class %X is small. Consider r2q change.\n", @@ -1460,12 +1505,16 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->buffer = hopt->buffer; cl->cbuffer = hopt->cbuffer; - if (cl->rate) - qdisc_put_rtab(cl->rate); - cl->rate = rtab; - if (cl->ceil) - qdisc_put_rtab(cl->ceil); - cl->ceil = ctab; + + cl->rate.rate_bps = (u64)hopt->rate.rate << 3; + cl->ceil.rate_bps = (u64)hopt->ceil.rate << 3; + + htb_precompute_ratedata(&cl->rate); + htb_precompute_ratedata(&cl->ceil); + + cl->buffer = hopt->buffer << PSCHED_SHIFT; + cl->cbuffer = hopt->buffer << PSCHED_SHIFT; + sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); @@ -1474,10 +1523,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, return 0; failure: - if (rtab) - qdisc_put_rtab(rtab); - if (ctab) - qdisc_put_rtab(ctab); return err; } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0a4b2f9a009..5da78a19ac9 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -63,6 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; } sch->flags |= TCQ_F_MQROOT; @@ -150,7 +151,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); - + if (new) + new->flags |= TCQ_F_ONETXQUEUE; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index d1831ca966d..accec33c454 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -132,6 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; } /* If the mqprio options indicate that hardware should own @@ -205,6 +206,9 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + if (dev->flags & IFF_UP) dev_activate(dev); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 9687fa1c227..6ed37652a4c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1,7 +1,8 @@ /* - * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. + * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. * * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * Copyright (c) 2012 Paolo Valente. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,12 +20,18 @@ #include <net/pkt_cls.h> -/* Quick Fair Queueing - =================== +/* Quick Fair Queueing Plus + ======================== Sources: - Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + [1] Paolo Valente, + "Reducing the Execution Time of Fair-Queueing Schedulers." + http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf + + Sources for QFQ: + + [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution Guarantees." See also: @@ -33,6 +40,20 @@ /* + QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES + classes. Each aggregate is timestamped with a virtual start time S + and a virtual finish time F, and scheduled according to its + timestamps. S and F are computed as a function of a system virtual + time function V. The classes within each aggregate are instead + scheduled with DRR. + + To speed up operations, QFQ+ divides also aggregates into a limited + number of groups. Which group a class belongs to depends on the + ratio between the maximum packet length for the class and the weight + of the class. Groups have their own S and F. In the end, QFQ+ + schedules groups, then aggregates within groups, then classes within + aggregates. See [1] and [2] for a full description. + Virtual time computations. S, F and V are all computed in fixed point arithmetic with @@ -76,27 +97,28 @@ #define QFQ_MAX_SLOTS 32 /* - * Shifts used for class<->group mapping. We allow class weights that are - * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the + * Shifts used for aggregate<->group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the * group with the smallest index that can support the L_i / r_i configured - * for the class. + * for the classes in the aggregate. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. */ #define QFQ_MAX_INDEX 24 -#define QFQ_MAX_WSHIFT 12 +#define QFQ_MAX_WSHIFT 10 -#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) -#define QFQ_MAX_WSUM (16*QFQ_MAX_WEIGHT) +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ +#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #define IWSUM (ONE_FP/QFQ_MAX_WSUM) #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ -#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) -#define QFQ_MIN_LMAX 256 /* min possible lmax for a class */ +#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ + +#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ /* * Possible group states. These values are used as indexes for the bitmaps @@ -106,6 +128,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; struct qfq_group; +struct qfq_aggregate; + struct qfq_class { struct Qdisc_class_common common; @@ -116,7 +140,12 @@ struct qfq_class { struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; struct Qdisc *qdisc; + struct list_head alist; /* Link for active-classes list. */ + struct qfq_aggregate *agg; /* Parent aggregate. */ + int deficit; /* DRR deficit counter. */ +}; +struct qfq_aggregate { struct hlist_node next; /* Link for the slot list. */ u64 S, F; /* flow timestamps (exact) */ @@ -127,8 +156,18 @@ struct qfq_class { struct qfq_group *grp; /* these are copied from the flowset. */ - u32 inv_w; /* ONE_FP/weight */ - u32 lmax; /* Max packet size for this flow. */ + u32 class_weight; /* Weight of each class in this aggregate. */ + /* Max pkt size for the classes in this aggregate, DRR quantum. */ + int lmax; + + u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ + u32 budgetmax; /* Max budget for this aggregate. */ + u32 initial_budget, budget; /* Initial and current budget. */ + + int num_classes; /* Number of classes in this aggr. */ + struct list_head active; /* DRR queue of active classes. */ + + struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ }; struct qfq_group { @@ -138,7 +177,7 @@ struct qfq_group { unsigned int front; /* Index of the front slot. */ unsigned long full_slots; /* non-empty slots */ - /* Array of RR lists of active classes. */ + /* Array of RR lists of active aggregates. */ struct hlist_head slots[QFQ_MAX_SLOTS]; }; @@ -146,13 +185,28 @@ struct qfq_sched { struct tcf_proto *filter_list; struct Qdisc_class_hash clhash; - u64 V; /* Precise virtual time. */ - u32 wsum; /* weight sum */ + u64 oldV, V; /* Precise virtual times. */ + struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ + u32 num_active_agg; /* Num. of active aggregates */ + u32 wsum; /* weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ + u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ + + u32 max_agg_classes; /* Max number of classes per aggr. */ + struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ }; +/* + * Possible reasons why the timestamps of an aggregate are updated + * enqueue: the aggregate switches from idle to active and must scheduled + * for service + * requeue: the aggregate finishes its budget, so it stops being served and + * must be rescheduled for service + */ +enum update_reason {enqueue, requeue}; + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -182,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ -static int qfq_calc_index(u32 inv_w, unsigned int maxlen) +static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) { u64 slot_size = (u64)maxlen * inv_w; unsigned long size_map; int index = 0; - size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; + size_map = slot_size >> min_slot_shift; if (!size_map) goto out; index = __fls(size_map) + 1; /* basically a log_2 */ - index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); if (index < 0) index = 0; @@ -204,66 +258,150 @@ out: return index; } -/* Length of the next packet (0 if the queue is empty). */ -static unsigned int qdisc_peek_len(struct Qdisc *sch) +static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); +static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, + enum update_reason); + +static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + u32 lmax, u32 weight) { - struct sk_buff *skb; + INIT_LIST_HEAD(&agg->active); + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + agg->lmax = lmax; + agg->class_weight = weight; +} + +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, + u32 lmax, u32 weight) +{ + struct qfq_aggregate *agg; + struct hlist_node *n; + + hlist_for_each_entry(agg, n, &q->nonfull_aggs, nonfull_next) + if (agg->lmax == lmax && agg->class_weight == weight) + return agg; + + return NULL; +} + - skb = sch->ops->peek(sch); - return skb ? qdisc_pkt_len(skb) : 0; +/* Update aggregate as a function of the new number of classes. */ +static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + int new_num_classes) +{ + u32 new_agg_weight; + + if (new_num_classes == q->max_agg_classes) + hlist_del_init(&agg->nonfull_next); + + if (agg->num_classes > new_num_classes && + new_num_classes == q->max_agg_classes - 1) /* agg no more full */ + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; + } + + q->wsum += + (int) agg->class_weight * (new_num_classes - agg->num_classes); + + agg->num_classes = new_num_classes; +} + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, + struct qfq_aggregate *agg, + struct qfq_class *cl) +{ + cl->agg = agg; + + qfq_update_agg(q, agg, agg->num_classes+1); + if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + list_add_tail(&cl->alist, &agg->active); + if (list_first_entry(&agg->active, struct qfq_class, alist) == + cl && q->in_serv_agg != agg) /* agg was inactive */ + qfq_activate_agg(q, agg, enqueue); /* schedule agg */ + } } -static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *); -static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, - unsigned int len); +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); -static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl, - u32 lmax, u32 inv_w, int delta_w) +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - int i; + if (!hlist_unhashed(&agg->nonfull_next)) + hlist_del_init(&agg->nonfull_next); + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); +} - /* update qfq-specific data */ - cl->lmax = lmax; - cl->inv_w = inv_w; - i = qfq_calc_index(cl->inv_w, cl->lmax); +/* Deschedule class from within its parent aggregate. */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; - cl->grp = &q->groups[i]; - q->wsum += delta_w; + list_del(&cl->alist); /* remove from RR queue of the aggregate */ + if (list_empty(&agg->active)) /* agg is now inactive */ + qfq_deactivate_agg(q, agg); } -static void qfq_update_reactivate_class(struct qfq_sched *q, - struct qfq_class *cl, - u32 inv_w, u32 lmax, int delta_w) +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { - bool need_reactivation = false; - int i = qfq_calc_index(inv_w, lmax); + struct qfq_aggregate *agg = cl->agg; - if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) { - /* - * shift cl->F back, to not charge the - * class for the not-yet-served head - * packet - */ - cl->F = cl->S; - /* remove class from its slot in the old group */ - qfq_deactivate_class(q, cl); - need_reactivation = true; + cl->agg = NULL; + if (agg->num_classes == 1) { /* agg being emptied, destroy it */ + qfq_destroy_agg(q, agg); + return; } + qfq_update_agg(q, agg, agg->num_classes-1); +} - qfq_update_class_params(q, cl, lmax, inv_w, delta_w); +/* Deschedule class and remove it from its parent aggregate. */ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + if (cl->qdisc->q.qlen > 0) /* class is active */ + qfq_deactivate_class(q, cl); - if (need_reactivation) /* activate in new group */ - qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc)); + qfq_rm_from_agg(q, cl); } +/* Move class to a new aggregate, matching the new class weight and/or lmax */ +static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, + u32 lmax) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + + if (new_agg == NULL) { /* create new aggregate */ + new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); + if (new_agg == NULL) + return -ENOBUFS; + qfq_init_agg(q, new_agg, lmax, weight); + } + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + + return 0; +} static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)*arg; + bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; + struct qfq_aggregate *new_agg = NULL; u32 weight, lmax, inv_w; int err; int delta_w; @@ -286,15 +424,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - inv_w = ONE_FP / weight; - weight = ONE_FP / inv_w; - delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0); - if (q->wsum + delta_w > QFQ_MAX_WSUM) { - pr_notice("qfq: total weight out of range (%u + %u)\n", - delta_w, q->wsum); - return -EINVAL; - } - if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { @@ -304,7 +433,23 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else lmax = psched_mtu(qdisc_dev(sch)); - if (cl != NULL) { + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + + if (cl != NULL && + lmax == cl->agg->lmax && + weight == cl->agg->class_weight) + return 0; /* nothing to change */ + + delta_w = weight - (cl ? cl->agg->class_weight : 0); + + if (q->wsum + delta_w > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%d + %u)\n", + delta_w, q->wsum); + return -EINVAL; + } + + if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), @@ -312,25 +457,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (err) return err; } - - if (lmax == cl->lmax && inv_w == cl->inv_w) - return 0; /* nothing to update */ - - sch_tree_lock(sch); - qfq_update_reactivate_class(q, cl, inv_w, lmax, delta_w); - sch_tree_unlock(sch); - - return 0; + existing = true; + goto set_change_agg; } + /* create and init new class */ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; cl->refcnt = 1; cl->common.classid = classid; - - qfq_update_class_params(q, cl, lmax, inv_w, delta_w); + cl->deficit = lmax; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); @@ -341,11 +479,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_new_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); - if (err) { - qdisc_destroy(cl->qdisc); - kfree(cl); - return err; - } + if (err) + goto destroy_class; } sch_tree_lock(sch); @@ -354,19 +489,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, qdisc_class_hash_grow(sch, &q->clhash); +set_change_agg: + sch_tree_lock(sch); + new_agg = qfq_find_agg(q, lmax, weight); + if (new_agg == NULL) { /* create new aggregate */ + sch_tree_unlock(sch); + new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); + if (new_agg == NULL) { + err = -ENOBUFS; + gen_kill_estimator(&cl->bstats, &cl->rate_est); + goto destroy_class; + } + sch_tree_lock(sch); + qfq_init_agg(q, new_agg, lmax, weight); + } + if (existing) + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + sch_tree_unlock(sch); + *arg = (unsigned long)cl; return 0; + +destroy_class: + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; } static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { struct qfq_sched *q = qdisc_priv(sch); - if (cl->inv_w) { - q->wsum -= ONE_FP / cl->inv_w; - cl->inv_w = 0; - } - + qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->bstats, &cl->rate_est); qdisc_destroy(cl->qdisc); kfree(cl); @@ -481,8 +636,8 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) || - nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax)) + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -500,8 +655,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, memset(&xstats, 0, sizeof(xstats)); cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; - xstats.weight = ONE_FP/cl->inv_w; - xstats.lmax = cl->lmax; + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || @@ -652,16 +807,16 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) * perhaps * old_V ^= q->V; - old_V >>= QFQ_MIN_SLOT_SHIFT; + old_V >>= q->min_slot_shift; if (old_V) { ... } * */ -static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) +static void qfq_make_eligible(struct qfq_sched *q) { - unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; - unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + unsigned long vslot = q->V >> q->min_slot_shift; + unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; @@ -672,34 +827,38 @@ static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) /* - * If the weight and lmax (max_pkt_size) of the classes do not change, - * then QFQ guarantees that the slot index is never higher than - * 2 + ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM). + * The index of the slot in which the aggregate is to be inserted must + * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1' + * because the start time of the group may be moved backward by one + * slot after the aggregate has been inserted, and this would cause + * non-empty slots to be right-shifted by one position. * - * With the current values of the above constants, the index is - * then guaranteed to never be higher than 2 + 256 * (1 / 16) = 18. + * If the weight and lmax (max_pkt_size) of the classes do not change, + * then QFQ+ does meet the above contraint according to the current + * values of its parameters. In fact, if the weight and lmax of the + * classes do not change, then, from the theory, QFQ+ guarantees that + * the slot index is never higher than + * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18 * * When the weight of a class is increased or the lmax of the class is - * decreased, a new class with smaller slot size may happen to be - * activated. The activation of this class should be properly delayed - * to when the service of the class has finished in the ideal system - * tracked by QFQ. If the activation of the class is not delayed to - * this reference time instant, then this class may be unjustly served - * before other classes waiting for service. This may cause - * (unfrequently) the above bound to the slot index to be violated for - * some of these unlucky classes. + * decreased, a new aggregate with smaller slot size than the original + * parent aggregate of the class may happen to be activated. The + * activation of this aggregate should be properly delayed to when the + * service of the class has finished in the ideal system tracked by + * QFQ+. If the activation of the aggregate is not delayed to this + * reference time instant, then this aggregate may be unjustly served + * before other aggregates waiting for service. This may cause the + * above bound to the slot index to be violated for some of these + * unlucky aggregates. * - * Instead of delaying the activation of the new class, which is quite - * complex, the following inaccurate but simple solution is used: if - * the slot index is higher than QFQ_MAX_SLOTS-2, then the timestamps - * of the class are shifted backward so as to let the slot index - * become equal to QFQ_MAX_SLOTS-2. This threshold is used because, if - * the slot index is above it, then the data structure implementing - * the bucket list either gets immediately corrupted or may get - * corrupted on a possible next packet arrival that causes the start - * time of the group to be shifted backward. + * Instead of delaying the activation of the new aggregate, which is + * quite complex, the following inaccurate but simple solution is used: + * if the slot index is higher than QFQ_MAX_SLOTS-2, then the + * timestamps of the aggregate are shifted backward so as to let the + * slot index become equal to QFQ_MAX_SLOTS-2. */ -static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, u64 roundedS) { u64 slot = (roundedS - grp->S) >> grp->slot_shift; @@ -708,22 +867,22 @@ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { u64 deltaS = roundedS - grp->S - ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); - cl->S -= deltaS; - cl->F -= deltaS; + agg->S -= deltaS; + agg->F -= deltaS; slot = QFQ_MAX_SLOTS - 2; } i = (grp->front + slot) % QFQ_MAX_SLOTS; - hlist_add_head(&cl->next, &grp->slots[i]); + hlist_add_head(&agg->next, &grp->slots[i]); __set_bit(slot, &grp->full_slots); } /* Maybe introduce hlist_first_entry?? */ -static struct qfq_class *qfq_slot_head(struct qfq_group *grp) +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) { return hlist_entry(grp->slots[grp->front].first, - struct qfq_class, next); + struct qfq_aggregate, next); } /* @@ -731,20 +890,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp) */ static void qfq_front_slot_remove(struct qfq_group *grp) { - struct qfq_class *cl = qfq_slot_head(grp); + struct qfq_aggregate *agg = qfq_slot_head(grp); - BUG_ON(!cl); - hlist_del(&cl->next); + BUG_ON(!agg); + hlist_del(&agg->next); if (hlist_empty(&grp->slots[grp->front])) __clear_bit(0, &grp->full_slots); } /* - * Returns the first full queue in a group. As a side effect, - * adjust the bucket list so the first non-empty bucket is at - * position 0 in full_slots. + * Returns the first aggregate in the first non-empty bucket of the + * group. As a side effect, adjusts the bucket list so the first + * non-empty bucket is at position 0 in full_slots. */ -static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) +static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) { unsigned int i; @@ -780,7 +939,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } -static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) +static void qfq_update_eligible(struct qfq_sched *q) { struct qfq_group *grp; unsigned long ineligible; @@ -792,137 +951,226 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) if (qfq_gt(grp->S, q->V)) q->V = grp->S; } - qfq_make_eligible(q, old_V); + qfq_make_eligible(q); } } -/* - * Updates the class, returns true if also the group needs to be updated. - */ -static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl) +/* Dequeue head packet of the head class in the DRR queue of the aggregate. */ +static void agg_dequeue(struct qfq_aggregate *agg, + struct qfq_class *cl, unsigned int len) { - unsigned int len = qdisc_peek_len(cl->qdisc); + qdisc_dequeue_peeked(cl->qdisc); - cl->S = cl->F; - if (!len) - qfq_front_slot_remove(grp); /* queue is empty */ - else { - u64 roundedS; + cl->deficit -= (int) len; - cl->F = cl->S + (u64)len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (roundedS == grp->S) - return false; - - qfq_front_slot_remove(grp); - qfq_slot_insert(grp, cl, roundedS); + if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ + list_del(&cl->alist); + else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + cl->deficit += agg->lmax; + list_move_tail(&cl->alist, &agg->active); } +} + +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, + struct qfq_class **cl, + unsigned int *len) +{ + struct sk_buff *skb; - return true; + *cl = list_first_entry(&agg->active, struct qfq_class, alist); + skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); + if (skb == NULL) + WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); + else + *len = qdisc_pkt_len(skb); + + return skb; +} + +/* Update F according to the actual service received by the aggregate. */ +static inline void charge_actual_service(struct qfq_aggregate *agg) +{ + /* compute the service received by the aggregate */ + u32 service_received = agg->initial_budget - agg->budget; + + agg->F = agg->S + (u64)service_received * agg->inv_w; } static struct sk_buff *qfq_dequeue(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; + struct qfq_aggregate *in_serv_agg = q->in_serv_agg; struct qfq_class *cl; - struct sk_buff *skb; - unsigned int len; - u64 old_V; + struct sk_buff *skb = NULL; + /* next-packet len, 0 means no more active classes in in-service agg */ + unsigned int len = 0; - if (!q->bitmaps[ER]) + if (in_serv_agg == NULL) return NULL; - grp = qfq_ffs(q, q->bitmaps[ER]); + if (!list_empty(&in_serv_agg->active)) + skb = qfq_peek_skb(in_serv_agg, &cl, &len); - cl = qfq_slot_head(grp); - skb = qdisc_dequeue_peeked(cl->qdisc); - if (!skb) { - WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); - return NULL; + /* + * If there are no active classes in the in-service aggregate, + * or if the aggregate has not enough budget to serve its next + * class, then choose the next aggregate to serve. + */ + if (len == 0 || in_serv_agg->budget < len) { + charge_actual_service(in_serv_agg); + + /* recharge the budget of the aggregate */ + in_serv_agg->initial_budget = in_serv_agg->budget = + in_serv_agg->budgetmax; + + if (!list_empty(&in_serv_agg->active)) + /* + * Still active: reschedule for + * service. Possible optimization: if no other + * aggregate is active, then there is no point + * in rescheduling this aggregate, and we can + * just keep it as the in-service one. This + * should be however a corner case, and to + * handle it, we would need to maintain an + * extra num_active_aggs field. + */ + qfq_activate_agg(q, in_serv_agg, requeue); + else if (sch->q.qlen == 0) { /* no aggregate to serve */ + q->in_serv_agg = NULL; + return NULL; + } + + /* + * If we get here, there are other aggregates queued: + * choose the new aggregate to serve. + */ + in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); + skb = qfq_peek_skb(in_serv_agg, &cl, &len); } + if (!skb) + return NULL; sch->q.qlen--; qdisc_bstats_update(sch, skb); - old_V = q->V; - len = qdisc_pkt_len(skb); + agg_dequeue(in_serv_agg, cl, len); + in_serv_agg->budget -= len; q->V += (u64)len * IWSUM; pr_debug("qfq dequeue: len %u F %lld now %lld\n", - len, (unsigned long long) cl->F, (unsigned long long) q->V); + len, (unsigned long long) in_serv_agg->F, + (unsigned long long) q->V); - if (qfq_update_class(grp, cl)) { - u64 old_F = grp->F; + return skb; +} - cl = qfq_slot_scan(grp); - if (!cl) - __clear_bit(grp->index, &q->bitmaps[ER]); - else { - u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); - unsigned int s; +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ + struct qfq_group *grp; + struct qfq_aggregate *agg, *new_front_agg; + u64 old_F; - if (grp->S == roundedS) - goto skip_unblock; - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - __clear_bit(grp->index, &q->bitmaps[ER]); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } + qfq_update_eligible(q); + q->oldV = q->V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + old_F = grp->F; + + agg = qfq_slot_head(grp); - qfq_unblock_groups(q, grp->index, old_F); + /* agg starts to be served, remove it from schedule */ + qfq_front_slot_remove(grp); + + new_front_agg = qfq_slot_scan(grp); + + if (new_front_agg == NULL) /* group is now inactive, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(new_front_agg->S, + grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + return agg; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); } -skip_unblock: - qfq_update_eligible(q, old_V); + qfq_unblock_groups(q, grp->index, old_F); - return skb; + return agg; } /* - * Assign a reasonable start time for a new flow k in group i. + * Assign a reasonable start time for a new aggregate in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate - * the ordering in ER. So, if we have groups in ER, set S to - * the F_j of the first group j which would be blocking us. + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. */ -static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) { unsigned long mask; u64 limit, roundedF; - int slot_shift = cl->grp->slot_shift; + int slot_shift = agg->grp->slot_shift; - roundedF = qfq_round_down(cl->F, slot_shift); + roundedF = qfq_round_down(agg->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); - if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], cl->grp->index); + mask = mask_from(q->bitmaps[ER], agg->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { if (qfq_gt(limit, next->F)) - cl->S = next->F; + agg->S = next->F; else /* preserve timestamp correctness */ - cl->S = limit; + agg->S = limit; return; } } - cl->S = q->V; + agg->S = q->V; } else /* timestamp is not stale */ - cl->S = cl->F; + agg->S = agg->F; } +/* + * Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} + +static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *); + static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; + struct qfq_aggregate *agg; int err = 0; cl = qfq_classify(skb, sch, &err); @@ -934,11 +1182,13 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); - if (unlikely(cl->lmax < qdisc_pkt_len(skb))) { + if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { pr_debug("qfq: increasing maxpkt from %u to %u for class %u", - cl->lmax, qdisc_pkt_len(skb), cl->common.classid); - qfq_update_reactivate_class(q, cl, cl->inv_w, - qdisc_pkt_len(skb), 0); + cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); + err = qfq_change_agg(sch, cl, cl->agg->class_weight, + qdisc_pkt_len(skb)); + if (err) + return err; } err = qdisc_enqueue(skb, cl->qdisc); @@ -954,35 +1204,50 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) bstats_update(&cl->bstats, skb); ++sch->q.qlen; - /* If the new skb is not the head of queue, then done here. */ - if (cl->qdisc->q.qlen != 1) + agg = cl->agg; + /* if the queue was not empty, then done here */ + if (cl->qdisc->q.qlen != 1) { + if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && + list_first_entry(&agg->active, struct qfq_class, alist) + == cl && cl->deficit < qdisc_pkt_len(skb)) + list_move_tail(&cl->alist, &agg->active); + return err; + } + + /* schedule class for service within the aggregate */ + cl->deficit = agg->lmax; + list_add_tail(&cl->alist, &agg->active); - /* If reach this point, queue q was idle */ - qfq_activate_class(q, cl, qdisc_pkt_len(skb)); + if (list_first_entry(&agg->active, struct qfq_class, alist) != cl) + return err; /* aggregate was not empty, nothing else to do */ + + /* recharge budget */ + agg->initial_budget = agg->budget = agg->budgetmax; + + qfq_update_agg_ts(q, agg, enqueue); + if (q->in_serv_agg == NULL) + q->in_serv_agg = agg; + else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); return err; } /* - * Handle class switch from idle to backlogged. + * Schedule aggregate according to its timestamps. */ -static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, - unsigned int pkt_len) +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - struct qfq_group *grp = cl->grp; + struct qfq_group *grp = agg->grp; u64 roundedS; int s; - qfq_update_start(q, cl); - - /* compute new finish time and rounded start. */ - cl->F = cl->S + (u64)pkt_len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); + roundedS = qfq_round_down(agg->S, grp->slot_shift); /* - * insert cl in the correct bucket. - * If cl->S >= grp->S we don't need to adjust the + * Insert agg in the correct bucket. + * If agg->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. @@ -990,10 +1255,10 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, * was in ER make sure to adjust V. */ if (grp->full_slots) { - if (!qfq_gt(grp->S, cl->S)) + if (!qfq_gt(grp->S, agg->S)) goto skip_update; - /* create a slot for this cl->S */ + /* create a slot for this agg->S */ qfq_slot_rotate(grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); @@ -1008,46 +1273,61 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", s, q->bitmaps[s], - (unsigned long long) cl->S, - (unsigned long long) cl->F, + (unsigned long long) agg->S, + (unsigned long long) agg->F, (unsigned long long) q->V); skip_update: - qfq_slot_insert(grp, cl, roundedS); + qfq_slot_insert(grp, agg, roundedS); } +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + enum update_reason reason) +{ + qfq_update_agg_ts(q, agg, reason); + qfq_schedule_agg(q, agg); +} + static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl) + struct qfq_aggregate *agg) { unsigned int i, offset; u64 roundedS; - roundedS = qfq_round_down(cl->S, grp->slot_shift); + roundedS = qfq_round_down(agg->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; - hlist_del(&cl->next); + hlist_del(&agg->next); if (hlist_empty(&grp->slots[i])) __clear_bit(offset, &grp->full_slots); } /* - * called to forcibly destroy a queue. - * If the queue is not in the front bucket, or if it has - * other queues in the front bucket, we can simply remove - * the queue with no other side effects. + * Called to forcibly deschedule an aggregate. If the aggregate is + * not in the front bucket, or if the latter has other aggregates in + * the front bucket, we can simply remove the aggregate with no other + * side effects. * Otherwise we must propagate the event up. */ -static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - struct qfq_group *grp = cl->grp; + struct qfq_group *grp = agg->grp; unsigned long mask; u64 roundedS; int s; - cl->F = cl->S; - qfq_slot_remove(q, grp, cl); + if (agg == q->in_serv_agg) { + charge_actual_service(agg); + q->in_serv_agg = qfq_choose_next_agg(q); + return; + } + + agg->F = agg->S; + qfq_slot_remove(q, grp, agg); if (!grp->full_slots) { __clear_bit(grp->index, &q->bitmaps[IR]); @@ -1066,8 +1346,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (hlist_empty(&grp->slots[grp->front])) { - cl = qfq_slot_scan(grp); - roundedS = qfq_round_down(cl->S, grp->slot_shift); + agg = qfq_slot_scan(grp); + roundedS = qfq_round_down(agg->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); @@ -1080,7 +1360,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) } } - qfq_update_eligible(q, q->V); + qfq_update_eligible(q); } static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) @@ -1092,6 +1372,32 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } +static unsigned int qfq_drop_from_slot(struct qfq_sched *q, + struct hlist_head *slot) +{ + struct qfq_aggregate *agg; + struct hlist_node *n; + struct qfq_class *cl; + unsigned int len; + + hlist_for_each_entry(agg, n, slot, next) { + list_for_each_entry(cl, &agg->active, alist) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + return 0; +} + static unsigned int qfq_drop(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); @@ -1101,24 +1407,13 @@ static unsigned int qfq_drop(struct Qdisc *sch) for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; for (j = 0; j < QFQ_MAX_SLOTS; j++) { - struct qfq_class *cl; - struct hlist_node *n; - - hlist_for_each_entry(cl, n, &grp->slots[j], next) { - - if (!cl->qdisc->ops->drop) - continue; - - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - sch->q.qlen--; - if (!cl->qdisc->q.qlen) - qfq_deactivate_class(q, cl); - - return len; - } + len = qfq_drop_from_slot(q, &grp->slots[j]); + if (len > 0) { + sch->q.qlen--; + return len; } } + } return 0; @@ -1129,44 +1424,51 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct qfq_sched *q = qdisc_priv(sch); struct qfq_group *grp; int i, j, err; + u32 max_cl_shift, maxbudg_shift, max_classes; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; + if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) + max_classes = QFQ_MAX_AGG_CLASSES; + else + max_classes = qdisc_dev(sch)->tx_queue_len + 1; + /* max_cl_shift = floor(log_2(max_classes)) */ + max_cl_shift = __fls(max_classes); + q->max_agg_classes = 1<<max_cl_shift; + + /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ + maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; + q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; + for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; - grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - - (QFQ_MAX_INDEX - i); + grp->slot_shift = q->min_slot_shift + i; for (j = 0; j < QFQ_MAX_SLOTS; j++) INIT_HLIST_HEAD(&grp->slots[j]); } + INIT_HLIST_HEAD(&q->nonfull_aggs); + return 0; } static void qfq_reset_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; struct qfq_class *cl; - struct hlist_node *n, *tmp; - unsigned int i, j; + struct hlist_node *n; + unsigned int i; - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - hlist_for_each_entry_safe(cl, n, tmp, - &grp->slots[j], next) { + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen > 0) qfq_deactivate_class(q, cl); - } - } - } - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) qdisc_reset(cl->qdisc); + } } sch->q.qlen = 0; } |