diff options
Diffstat (limited to 'net/sched')
39 files changed, 22039 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig new file mode 100644 index 00000000000..3d1d902dd1a --- /dev/null +++ b/net/sched/Kconfig @@ -0,0 +1,508 @@ +# +# Traffic control configuration. +# +choice + prompt "Packet scheduler clock source" + depends on NET_SCHED + default NET_SCH_CLK_JIFFIES + help + Packet schedulers need a monotonic clock that increments at a static + rate. The kernel provides several suitable interfaces, each with + different properties: + + - high resolution (us or better) + - fast to read (minimal locking, no i/o access) + - synchronized on all processors + - handles cpu clock frequency changes + + but nothing provides all of the above. + +config NET_SCH_CLK_JIFFIES + bool "Timer interrupt" + help + Say Y here if you want to use the timer interrupt (jiffies) as clock + source. This clock source is fast, synchronized on all processors and + handles cpu clock frequency changes, but its resolution is too low + for accurate shaping except at very low speed. + +config NET_SCH_CLK_GETTIMEOFDAY + bool "gettimeofday" + help + Say Y here if you want to use gettimeofday as clock source. This clock + source has high resolution, is synchronized on all processors and + handles cpu clock frequency changes, but it is slow. + + Choose this if you need a high resolution clock source but can't use + the CPU's cycle counter. + +config NET_SCH_CLK_CPU + bool "CPU cycle counter" + depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64 + help + Say Y here if you want to use the CPU's cycle counter as clock source. + This is a cheap and high resolution clock source, but on some + architectures it is not synchronized on all processors and doesn't + handle cpu clock frequency changes. + + The useable cycle counters are: + + x86/x86_64 - Timestamp Counter + alpha - Cycle Counter + sparc64 - %ticks register + ppc64 - Time base + ia64 - Interval Time Counter + + Choose this if your CPU's cycle counter is working properly. + +endchoice + +config NET_SCH_CBQ + tristate "CBQ packet scheduler" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Class-Based Queueing (CBQ) packet + scheduling algorithm for some of your network devices. This + algorithm classifies the waiting packets into a tree-like hierarchy + of classes; the leaves of this tree are in turn scheduled by + separate algorithms (called "disciplines" in this context). + + See the top of <file:net/sched/sch_cbq.c> for references about the + CBQ algorithm. + + CBQ is a commonly used scheduler, so if you're unsure, you should + say Y here. Then say Y to all the queueing algorithms below that you + want to use as CBQ disciplines. Then say Y to "Packet classifier + API" and say Y to all the classifiers you want to use; a classifier + is a routine that allows you to sort your outgoing traffic into + classes based on a certain criterion. + + To compile this code as a module, choose M here: the + module will be called sch_cbq. + +config NET_SCH_HTB + tristate "HTB packet scheduler" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Hierarchical Token Buckets (HTB) + packet scheduling algorithm for some of your network devices. See + <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and + in-depth articles. + + HTB is very similar to the CBQ regarding its goals however is has + different properties and different algorithm. + + To compile this code as a module, choose M here: the + module will be called sch_htb. + +config NET_SCH_HFSC + tristate "HFSC packet scheduler" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Hierarchical Fair Service Curve + (HFSC) packet scheduling algorithm for some of your network devices. + + To compile this code as a module, choose M here: the + module will be called sch_hfsc. + +#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ +config NET_SCH_ATM + tristate "ATM pseudo-scheduler" + depends on NET_SCHED && ATM + ---help--- + Say Y here if you want to use the ATM pseudo-scheduler. This + provides a framework for invoking classifiers (aka "filters"), which + in turn select classes of this queuing discipline. Each class maps + the flow(s) it is handling to a given virtual circuit (see the top of + <file:net/sched/sch_atm.c>). + + To compile this code as a module, choose M here: the + module will be called sch_atm. + +config NET_SCH_PRIO + tristate "The simplest PRIO pseudoscheduler" + depends on NET_SCHED + help + Say Y here if you want to use an n-band priority queue packet + "scheduler" for some of your network devices or as a leaf discipline + for the CBQ scheduling algorithm. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called sch_prio. + +config NET_SCH_RED + tristate "RED queue" + depends on NET_SCHED + help + Say Y here if you want to use the Random Early Detection (RED) + packet scheduling algorithm for some of your network devices (see + the top of <file:net/sched/sch_red.c> for details and references + about the algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_red. + +config NET_SCH_SFQ + tristate "SFQ queue" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) + packet scheduling algorithm for some of your network devices or as a + leaf discipline for the CBQ scheduling algorithm (see the top of + <file:net/sched/sch_sfq.c> for details and references about the SFQ + algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_sfq. + +config NET_SCH_TEQL + tristate "TEQL queue" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the True Link Equalizer (TLE) packet + scheduling algorithm for some of your network devices or as a leaf + discipline for the CBQ scheduling algorithm. This queueing + discipline allows the combination of several physical devices into + one virtual device. (see the top of <file:net/sched/sch_teql.c> for + details). + + To compile this code as a module, choose M here: the + module will be called sch_teql. + +config NET_SCH_TBF + tristate "TBF queue" + depends on NET_SCHED + help + Say Y here if you want to use the Simple Token Bucket Filter (TBF) + packet scheduling algorithm for some of your network devices or as a + leaf discipline for the CBQ scheduling algorithm (see the top of + <file:net/sched/sch_tbf.c> for a description of the TBF algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_tbf. + +config NET_SCH_GRED + tristate "GRED queue" + depends on NET_SCHED + help + Say Y here if you want to use the Generic Random Early Detection + (RED) packet scheduling algorithm for some of your network devices + (see the top of <file:net/sched/sch_red.c> for details and + references about the algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_gred. + +config NET_SCH_DSMARK + tristate "Diffserv field marker" + depends on NET_SCHED + help + Say Y if you want to schedule packets according to the + Differentiated Services architecture proposed in RFC 2475. + Technical information on this method, with pointers to associated + RFCs, is available at <http://www.gta.ufrj.br/diffserv/>. + + To compile this code as a module, choose M here: the + module will be called sch_dsmark. + +config NET_SCH_NETEM + tristate "Network emulator" + depends on NET_SCHED + help + Say Y if you want to emulate network delay, loss, and packet + re-ordering. This is often useful to simulate networks when + testing applications or protocols. + + To compile this driver as a module, choose M here: the module + will be called sch_netem. + + If unsure, say N. + +config NET_SCH_INGRESS + tristate "Ingress Qdisc" + depends on NET_SCHED + help + If you say Y here, you will be able to police incoming bandwidth + and drop packets when this bandwidth exceeds your desired rate. + If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called sch_ingress. + +config NET_QOS + bool "QoS support" + depends on NET_SCHED + ---help--- + Say Y here if you want to include Quality Of Service scheduling + features, which means that you will be able to request certain + rate-of-flow limits for your network devices. + + This Quality of Service (QoS) support will enable you to use + Differentiated Services (diffserv) and Resource Reservation Protocol + (RSVP) on your Linux router if you also say Y to "Packet classifier + API" and to some classifiers below. Documentation and software is at + <http://diffserv.sourceforge.net/>. + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about QoS support. + +config NET_ESTIMATOR + bool "Rate estimator" + depends on NET_QOS + help + In order for Quality of Service scheduling to work, the current + rate-of-flow for a network device has to be estimated; if you say Y + here, the kernel will do just that. + +config NET_CLS + bool "Packet classifier API" + depends on NET_SCHED + ---help--- + The CBQ scheduling algorithm requires that network packets which are + scheduled to be sent out over a network device be classified + according to some criterion. If you say Y here, you will get a + choice of several different packet classifiers with the following + questions. + + This will enable you to use Differentiated Services (diffserv) and + Resource Reservation Protocol (RSVP) on your Linux router. + Documentation and software is at + <http://diffserv.sourceforge.net/>. + +config NET_CLS_BASIC + tristate "Basic classifier" + depends on NET_CLS + ---help--- + Say Y here if you want to be able to classify packets using + only extended matches and actions. + + To compile this code as a module, choose M here: the + module will be called cls_basic. + +config NET_CLS_TCINDEX + tristate "TC index classifier" + depends on NET_CLS + help + If you say Y here, you will be able to classify outgoing packets + according to the tc_index field of the skb. You will want this + feature if you want to implement Differentiated Services using + sch_dsmark. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called cls_tcindex. + +config NET_CLS_ROUTE4 + tristate "Routing table based classifier" + depends on NET_CLS + select NET_CLS_ROUTE + help + If you say Y here, you will be able to classify outgoing packets + according to the route table entry they matched. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called cls_route. + +config NET_CLS_ROUTE + bool + default n + +config NET_CLS_FW + tristate "Firewall based classifier" + depends on NET_CLS + help + If you say Y here, you will be able to classify outgoing packets + according to firewall criteria you specified. + + To compile this code as a module, choose M here: the + module will be called cls_fw. + +config NET_CLS_U32 + tristate "U32 classifier" + depends on NET_CLS + help + If you say Y here, you will be able to classify outgoing packets + according to their destination address. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called cls_u32. + +config CLS_U32_PERF + bool "U32 classifier performance counters" + depends on NET_CLS_U32 + help + gathers stats that could be used to tune u32 classifier performance. + Requires a new iproute2 + You MUST NOT turn this on if you dont have an update iproute2. + +config NET_CLS_IND + bool "classify input device (slows things u32/fw) " + depends on NET_CLS_U32 || NET_CLS_FW + help + This option will be killed eventually when a + metadata action appears because it slows things a little + Available only for u32 and fw classifiers. + Requires a new iproute2 + You MUST NOT turn this on if you dont have an update iproute2. + +config CLS_U32_MARK + bool "Use nfmark as a key in U32 classifier" + depends on NET_CLS_U32 && NETFILTER + help + This allows you to match mark in a u32 filter. + Example: + tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \ + match mark 0x0090 0xffff \ + match ip dst 4.4.4.4 \ + flowid 1:90 + You must use a new iproute2 to use this feature. + +config NET_CLS_RSVP + tristate "Special RSVP classifier" + depends on NET_CLS && NET_QOS + ---help--- + The Resource Reservation Protocol (RSVP) permits end systems to + request a minimum and maximum data flow rate for a connection; this + is important for real time data such as streaming sound or video. + + Say Y here if you want to be able to classify outgoing packets based + on their RSVP requests. + + To compile this code as a module, choose M here: the + module will be called cls_rsvp. + +config NET_CLS_RSVP6 + tristate "Special RSVP classifier for IPv6" + depends on NET_CLS && NET_QOS + ---help--- + The Resource Reservation Protocol (RSVP) permits end systems to + request a minimum and maximum data flow rate for a connection; this + is important for real time data such as streaming sound or video. + + Say Y here if you want to be able to classify outgoing packets based + on their RSVP requests and you are using the new Internet Protocol + IPv6 as opposed to the older and more common IPv4. + + To compile this code as a module, choose M here: the + module will be called cls_rsvp6. + +config NET_EMATCH + bool "Extended Matches" + depends on NET_CLS + ---help--- + Say Y here if you want to use extended matches on top of classifiers + and select the extended matches below. + + Extended matches are small classification helpers not worth writing + a separate classifier. + + You must have a recent version of the iproute2 tools in order to use + extended matches. + +config NET_EMATCH_STACK + int "Stack size" + depends on NET_EMATCH + default "32" + ---help--- + Size of the local stack variable used while evaluating the tree of + ematches. Limits the depth of the tree, i.e. the number of + encapsulated precedences. Every level requires 4 bytes of addtional + stack space. + +config NET_EMATCH_CMP + tristate "Simple packet data comparison" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + simple packet data comparisons for 8, 16, and 32bit values. + + To compile this code as a module, choose M here: the + module will be called em_cmp. + +config NET_EMATCH_NBYTE + tristate "Multi byte comparison" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + multiple byte comparisons mainly useful for IPv6 address comparisons. + + To compile this code as a module, choose M here: the + module will be called em_nbyte. + +config NET_EMATCH_U32 + tristate "U32 hashing key" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets using + the famous u32 key in combination with logic relations. + + To compile this code as a module, choose M here: the + module will be called em_u32. + +config NET_EMATCH_META + tristate "Metadata" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be ablt to classify packets based on + metadata such as load average, netfilter attributes, socket + attributes and routing decisions. + + To compile this code as a module, choose M here: the + module will be called em_meta. + +config NET_CLS_ACT + bool "Packet ACTION" + depends on EXPERIMENTAL && NET_CLS && NET_QOS + ---help--- + This option requires you have a new iproute2. It enables + tc extensions which can be used with tc classifiers. + You MUST NOT turn this on if you dont have an update iproute2. + +config NET_ACT_POLICE + tristate "Policing Actions" + depends on NET_CLS_ACT + ---help--- + If you are using a newer iproute2 select this one, otherwise use one + below to select a policer. + You MUST NOT turn this on if you dont have an update iproute2. + +config NET_ACT_GACT + tristate "generic Actions" + depends on NET_CLS_ACT + ---help--- + You must have new iproute2 to use this feature. + This adds simple filtering actions like drop, accept etc. + +config GACT_PROB + bool "generic Actions probability" + depends on NET_ACT_GACT + ---help--- + Allows generic actions to be randomly or deterministically used. + +config NET_ACT_MIRRED + tristate "Packet In/Egress redirecton/mirror Actions" + depends on NET_CLS_ACT + ---help--- + requires new iproute2 + This allows packets to be mirrored or redirected to netdevices + +config NET_ACT_IPT + tristate "iptables Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- + requires new iproute2 + This allows iptables targets to be used by tc filters + +config NET_ACT_PEDIT + tristate "Generic Packet Editor Actions" + depends on NET_CLS_ACT + ---help--- + requires new iproute2 + This allows for packets to be generically edited + +config NET_CLS_POLICE + bool "Traffic policing (needed for in/egress)" + depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y + help + Say Y to support traffic policing (bandwidth limits). Needed for + ingress and egress rate limiting. + diff --git a/net/sched/Makefile b/net/sched/Makefile new file mode 100644 index 00000000000..431e55786ef --- /dev/null +++ b/net/sched/Makefile @@ -0,0 +1,41 @@ +# +# Makefile for the Linux Traffic Control Unit. +# + +obj-y := sch_generic.o + +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_CLS) += cls_api.o +obj-$(CONFIG_NET_CLS_ACT) += act_api.o +obj-$(CONFIG_NET_ACT_POLICE) += police.o +obj-$(CONFIG_NET_CLS_POLICE) += police.o +obj-$(CONFIG_NET_ACT_GACT) += gact.o +obj-$(CONFIG_NET_ACT_MIRRED) += mirred.o +obj-$(CONFIG_NET_ACT_IPT) += ipt.o +obj-$(CONFIG_NET_ACT_PEDIT) += pedit.o +obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o +obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o +obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o +obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o +obj-$(CONFIG_NET_SCH_RED) += sch_red.o +obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o +obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o +obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o +obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o +obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o +obj-$(CONFIG_NET_CLS_U32) += cls_u32.o +obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o +obj-$(CONFIG_NET_CLS_FW) += cls_fw.o +obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o +obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o +obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o +obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o +obj-$(CONFIG_NET_EMATCH) += ematch.o +obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o +obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o +obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o +obj-$(CONFIG_NET_EMATCH_META) += em_meta.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c new file mode 100644 index 00000000000..5e6cc371b39 --- /dev/null +++ b/net/sched/act_api.c @@ -0,0 +1,894 @@ +/* + * net/sched/act_api.c Packet action API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Author: Jamal Hadi Salim + * + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <net/sock.h> +#include <net/sch_generic.h> +#include <net/act_api.h> + +#if 1 /* control */ +#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args) +#else +#define DPRINTK(format, args...) +#endif +#if 0 /* data */ +#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args) +#else +#define D2PRINTK(format, args...) +#endif + +static struct tc_action_ops *act_base = NULL; +static DEFINE_RWLOCK(act_mod_lock); + +int tcf_register_action(struct tc_action_ops *act) +{ + struct tc_action_ops *a, **ap; + + write_lock(&act_mod_lock); + for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { + write_unlock(&act_mod_lock); + return -EEXIST; + } + } + act->next = NULL; + *ap = act; + write_unlock(&act_mod_lock); + return 0; +} + +int tcf_unregister_action(struct tc_action_ops *act) +{ + struct tc_action_ops *a, **ap; + int err = -ENOENT; + + write_lock(&act_mod_lock); + for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) + if (a == act) + break; + if (a) { + *ap = a->next; + a->next = NULL; + err = 0; + } + write_unlock(&act_mod_lock); + return err; +} + +/* lookup by name */ +static struct tc_action_ops *tc_lookup_action_n(char *kind) +{ + struct tc_action_ops *a = NULL; + + if (kind) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (strcmp(kind, a->kind) == 0) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} + +/* lookup by rtattr */ +static struct tc_action_ops *tc_lookup_action(struct rtattr *kind) +{ + struct tc_action_ops *a = NULL; + + if (kind) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (rtattr_strcmp(kind, a->kind) == 0) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} + +#if 0 +/* lookup by id */ +static struct tc_action_ops *tc_lookup_action_id(u32 type) +{ + struct tc_action_ops *a = NULL; + + if (type) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (a->type == type) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} +#endif + +int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, + struct tcf_result *res) +{ + struct tc_action *a; + int ret = -1; + + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n", + skb, skb->input_dev ? skb->input_dev->name : "xxx", + skb->dev->name); + ret = TC_ACT_OK; + goto exec_done; + } + while ((a = act) != NULL) { +repeat: + if (a->ops && a->ops->act) { + ret = a->ops->act(&skb, a); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); + } + if (ret != TC_ACT_PIPE) + goto exec_done; + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + } + act = a->next; + } +exec_done: + if (skb->tc_classid > 0) { + res->classid = skb->tc_classid; + res->class = 0; + skb->tc_classid = 0; + } + return ret; +} + +void tcf_action_destroy(struct tc_action *act, int bind) +{ + struct tc_action *a; + + for (a = act; a; a = act) { + if (a->ops && a->ops->cleanup) { + DPRINTK("tcf_action_destroy destroying %p next %p\n", + a, a->next); + if (a->ops->cleanup(a, bind) == ACT_P_DELETED) + module_put(a->ops->owner); + act = act->next; + kfree(a); + } else { /*FIXME: Remove later - catch insertion bugs*/ + printk("tcf_action_destroy: BUG? destroying NULL ops\n"); + act = act->next; + kfree(a); + } + } +} + +int +tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + + if (a->ops == NULL || a->ops->dump == NULL) + return err; + return a->ops->dump(skb, a, bind, ref); +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb->tail; + struct rtattr *r; + + if (a->ops == NULL || a->ops->dump == NULL) + return err; + + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); + if (tcf_action_copy_stats(skb, a, 0)) + goto rtattr_failure; + r = (struct rtattr*) skb->tail; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { + r->rta_len = skb->tail - (u8*)r; + return err; + } + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int +tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +{ + struct tc_action *a; + int err = -EINVAL; + unsigned char *b = skb->tail; + struct rtattr *r ; + + while ((a = act) != NULL) { + r = (struct rtattr*) skb->tail; + act = a->next; + RTA_PUT(skb, a->order, 0, NULL); + err = tcf_action_dump_1(skb, a, bind, ref); + if (err < 0) + goto rtattr_failure; + r->rta_len = skb->tail - (u8*)r; + } + + return 0; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -err; +} + +struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, + char *name, int ovr, int bind, int *err) +{ + struct tc_action *a; + struct tc_action_ops *a_o; + char act_name[IFNAMSIZ]; + struct rtattr *tb[TCA_ACT_MAX+1]; + struct rtattr *kind; + + *err = -EINVAL; + + if (name == NULL) { + if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + goto err_out; + kind = tb[TCA_ACT_KIND-1]; + if (kind == NULL) + goto err_out; + if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + goto err_out; + } else { + if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) + goto err_out; + } + + a_o = tc_lookup_action_n(act_name); + if (a_o == NULL) { +#ifdef CONFIG_KMOD + rtnl_unlock(); + request_module(act_name); + rtnl_lock(); + + a_o = tc_lookup_action_n(act_name); + + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * tell the caller to replay the request. We + * indicate this using -EAGAIN. + */ + if (a_o != NULL) { + *err = -EAGAIN; + goto err_mod; + } +#endif + goto err_out; + } + + *err = -ENOMEM; + a = kmalloc(sizeof(*a), GFP_KERNEL); + if (a == NULL) + goto err_mod; + memset(a, 0, sizeof(*a)); + + /* backward compatibility for policer */ + if (name == NULL) + *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind); + else + *err = a_o->init(rta, est, a, ovr, bind); + if (*err < 0) + goto err_free; + + /* module count goes up only when brand new policy is created + if it exists and is only bound to in a_o->init() then + ACT_P_CREATED is not returned (a zero is). + */ + if (*err != ACT_P_CREATED) + module_put(a_o->owner); + a->ops = a_o; + DPRINTK("tcf_action_init_1: successfull %s\n", act_name); + + *err = 0; + return a; + +err_free: + kfree(a); +err_mod: + module_put(a_o->owner); +err_out: + return NULL; +} + +struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, + char *name, int ovr, int bind, int *err) +{ + struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; + struct tc_action *head = NULL, *act, *act_prev = NULL; + int i; + + if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) { + *err = -EINVAL; + return head; + } + + for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_init_1(tb[i], est, name, ovr, bind, err); + if (act == NULL) + goto err; + act->order = i+1; + + if (head == NULL) + head = act; + else + act_prev->next = act; + act_prev = act; + } + return head; + +err: + if (head != NULL) + tcf_action_destroy(head, bind); + return NULL; +} + +int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, + int compat_mode) +{ + int err = 0; + struct gnet_dump d; + struct tcf_act_hdr *h = a->priv; + + if (h == NULL) + goto errout; + + /* compat_mode being true specifies a call that is supposed + * to add additional backward compatiblity statistic TLVs. + */ + if (compat_mode) { + if (a->type == TCA_OLD_COMPAT) + err = gnet_stats_start_copy_compat(skb, 0, + TCA_STATS, TCA_XSTATS, h->stats_lock, &d); + else + return 0; + } else + err = gnet_stats_start_copy(skb, TCA_ACT_STATS, + h->stats_lock, &d); + + if (err < 0) + goto errout; + + if (a->ops != NULL && a->ops->get_stats != NULL) + if (a->ops->get_stats(skb, a) < 0) + goto errout; + + if (gnet_stats_copy_basic(&d, &h->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 || +#endif + gnet_stats_copy_queue(&d, &h->qstats) < 0) + goto errout; + + if (gnet_stats_finish_copy(&d) < 0) + goto errout; + + return 0; + +errout: + return -1; +} + +static int +tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, + unsigned flags, int event, int bind, int ref) +{ + struct tcamsg *t; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rtattr *x; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); + nlh->nlmsg_flags = flags; + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr*) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + if (tcf_action_dump(skb, a, bind, ref) < 0) + goto rtattr_failure; + + x->rta_len = skb->tail - (u8*)x; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) +{ + struct sk_buff *skb; + int err = 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + if (err > 0) + err = 0; + return err; +} + +static struct tc_action * +tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err) +{ + struct rtattr *tb[TCA_ACT_MAX+1]; + struct tc_action *a; + int index; + + *err = -EINVAL; + if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + return NULL; + + if (tb[TCA_ACT_INDEX - 1] == NULL || + RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index)) + return NULL; + index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]); + + *err = -ENOMEM; + a = kmalloc(sizeof(struct tc_action), GFP_KERNEL); + if (a == NULL) + return NULL; + memset(a, 0, sizeof(struct tc_action)); + + *err = -EINVAL; + a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]); + if (a->ops == NULL) + goto err_free; + if (a->ops->lookup == NULL) + goto err_mod; + *err = -ENOENT; + if (a->ops->lookup(a, index) == 0) + goto err_mod; + + module_put(a->ops->owner); + *err = 0; + return a; +err_mod: + module_put(a->ops->owner); +err_free: + kfree(a); + return NULL; +} + +static void cleanup_a(struct tc_action *act) +{ + struct tc_action *a; + + for (a = act; a; a = act) { + act = a->next; + kfree(a); + } +} + +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kmalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + printk("create_a: failed to alloc!\n"); + return NULL; + } + memset(act, 0, sizeof(*act)); + act->order = i; + return act; +} + +static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) +{ + struct sk_buff *skb; + unsigned char *b; + struct nlmsghdr *nlh; + struct tcamsg *t; + struct netlink_callback dcb; + struct rtattr *x; + struct rtattr *tb[TCA_ACT_MAX+1]; + struct rtattr *kind; + struct tc_action *a = create_a(0); + int err = -EINVAL; + + if (a == NULL) { + printk("tca_action_flush: couldnt create tc_action\n"); + return err; + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) { + printk("tca_action_flush: failed skb alloc\n"); + kfree(a); + return -ENOBUFS; + } + + b = (unsigned char *)skb->tail; + + if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + goto err_out; + + kind = tb[TCA_ACT_KIND-1]; + a->ops = tc_lookup_action(kind); + if (a->ops == NULL) + goto err_out; + + nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr *) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + if (err < 0) + goto rtattr_failure; + + x->rta_len = skb->tail - (u8 *) x; + + nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_flags |= NLM_F_ROOT; + module_put(a->ops->owner); + kfree(a); + err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + if (err > 0) + return 0; + + return err; + +rtattr_failure: + module_put(a->ops->owner); +nlmsg_failure: +err_out: + kfree_skb(skb); + kfree(a); + return err; +} + +static int +tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) +{ + int i, ret = 0; + struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; + struct tc_action *head = NULL, *act, *act_prev = NULL; + + if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) + return -EINVAL; + + if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { + if (tb[0] != NULL && tb[1] == NULL) + return tca_action_flush(tb[0], n, pid); + } + + for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_get_1(tb[i], n, pid, &ret); + if (act == NULL) + goto err; + act->order = i+1; + + if (head == NULL) + head = act; + else + act_prev->next = act; + act_prev = act; + } + + if (event == RTM_GETACTION) + ret = act_get_notify(pid, n, head, event); + else { /* delete */ + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) { + ret = -ENOBUFS; + goto err; + } + + if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, + 0, 1) <= 0) { + kfree_skb(skb); + ret = -EINVAL; + goto err; + } + + /* now do the delete */ + tcf_action_destroy(head, 0); + ret = rtnetlink_send(skb, pid, RTMGRP_TC, + n->nlmsg_flags&NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; + } +err: + cleanup_a(head); + return ret; +} + +static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, + unsigned flags) +{ + struct tcamsg *t; + struct nlmsghdr *nlh; + struct sk_buff *skb; + struct rtattr *x; + unsigned char *b; + int err = 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + b = (unsigned char *)skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); + nlh->nlmsg_flags = flags; + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr*) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + if (tcf_action_dump(skb, a, 0, 0) < 0) + goto rtattr_failure; + + x->rta_len = skb->tail - (u8*)x; + + nlh->nlmsg_len = skb->tail - b; + NETLINK_CB(skb).dst_groups = RTMGRP_TC; + + err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); + if (err > 0) + err = 0; + return err; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + + +static int +tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr) +{ + int ret = 0; + struct tc_action *act; + struct tc_action *a; + u32 seq = n->nlmsg_seq; + + act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret); + if (act == NULL) + goto done; + + /* dump then free all the actions after update; inserted policy + * stays intact + * */ + ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); + for (a = act; a; a = act) { + act = a->next; + kfree(a); + } +done: + return ret; +} + +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca = arg; + u32 pid = skb ? NETLINK_CB(skb).pid : 0; + int ret = 0, ovr = 0; + + if (tca[TCA_ACT_TAB-1] == NULL) { + printk("tc_ctl_action: received NO action attribs\n"); + return -EINVAL; + } + + /* n->nlmsg_flags&NLM_F_CREATE + * */ + switch (n->nlmsg_type) { + case RTM_NEWACTION: + /* we are going to assume all other flags + * imply create only if it doesnt exist + * Note that CREATE | EXCL implies that + * but since we want avoid ambiguity (eg when flags + * is zero) then just set this + */ + if (n->nlmsg_flags&NLM_F_REPLACE) + ovr = 1; +replay: + ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr); + if (ret == -EAGAIN) + goto replay; + break; + case RTM_DELACTION: + ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION); + break; + case RTM_GETACTION: + ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION); + break; + default: + BUG(); + } + + return ret; +} + +static char * +find_dump_kind(struct nlmsghdr *n) +{ + struct rtattr *tb1, *tb2[TCA_ACT_MAX+1]; + struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct rtattr *rta[TCAA_MAX + 1]; + struct rtattr *kind; + int min_len = NLMSG_LENGTH(sizeof(struct tcamsg)); + int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len); + + if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0) + return NULL; + tb1 = rta[TCA_ACT_TAB - 1]; + if (tb1 == NULL) + return NULL; + + if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1), + NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0) + return NULL; + if (tb[0] == NULL) + return NULL; + + if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]), + RTA_PAYLOAD(tb[0])) < 0) + return NULL; + kind = tb2[TCA_ACT_KIND-1]; + + return (char *) RTA_DATA(kind); +} + +static int +tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rtattr *x; + struct tc_action_ops *a_o; + struct tc_action a; + int ret = 0; + struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); + char *kind = find_dump_kind(cb->nlh); + + if (kind == NULL) { + printk("tc_dump_action: action bad kind\n"); + return 0; + } + + a_o = tc_lookup_action_n(kind); + if (a_o == NULL) { + printk("failed to find %s\n", kind); + return 0; + } + + memset(&a, 0, sizeof(struct tc_action)); + a.ops = a_o; + + if (a_o->walk == NULL) { + printk("tc_dump_action: %s !capable of dumping table\n", kind); + goto rtattr_failure; + } + + nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*t)); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr *) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + ret = a_o->walk(skb, cb, RTM_GETACTION, &a); + if (ret < 0) + goto rtattr_failure; + + if (ret > 0) { + x->rta_len = skb->tail - (u8 *) x; + ret = skb->len; + } else + skb_trim(skb, (u8*)x - skb->data); + + nlh->nlmsg_len = skb->tail - b; + if (NETLINK_CB(cb->skb).pid && ret) + nlh->nlmsg_flags |= NLM_F_MULTI; + module_put(a_o->owner); + return skb->len; + +rtattr_failure: +nlmsg_failure: + module_put(a_o->owner); + skb_trim(skb, b - skb->data); + return skb->len; +} + +static int __init tc_action_init(void) +{ + struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + + if (link_p) { + link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action; + link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action; + link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action; + link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; + } + + printk("TC classifier action (bugs to netdev@oss.sgi.com cc " + "hadi@cyberus.ca)\n"); + return 0; +} + +subsys_initcall(tc_action_init); + +EXPORT_SYMBOL(tcf_register_action); +EXPORT_SYMBOL(tcf_unregister_action); +EXPORT_SYMBOL(tcf_action_exec); +EXPORT_SYMBOL(tcf_action_dump_1); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c new file mode 100644 index 00000000000..56e66c3fe0f --- /dev/null +++ b/net/sched/cls_api.c @@ -0,0 +1,642 @@ +/* + * net/sched/cls_api.c Packet classifier API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * + * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +#if 0 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +/* The list of all installed classifier types */ + +static struct tcf_proto_ops *tcf_proto_base; + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static DEFINE_RWLOCK(cls_mod_lock); + +/* Find classifier type by string name */ + +static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +{ + struct tcf_proto_ops *t = NULL; + + if (kind) { + read_lock(&cls_mod_lock); + for (t = tcf_proto_base; t; t = t->next) { + if (rtattr_strcmp(kind, t->kind) == 0) { + if (!try_module_get(t->owner)) + t = NULL; + break; + } + } + read_unlock(&cls_mod_lock); + } + return t; +} + +/* Register(unregister) new classifier type */ + +int register_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + int rc = -EEXIST; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + if (!strcmp(ops->kind, t->kind)) + goto out; + + ops->next = NULL; + *tp = ops; + rc = 0; +out: + write_unlock(&cls_mod_lock); + return rc; +} + +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + int rc = -ENOENT; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (t == ops) + break; + + if (!t) + goto out; + *tp = t->next; + rc = 0; +out: + write_unlock(&cls_mod_lock); + return rc; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event); + + +/* Select new prio value from the range, managed by kernel. */ + +static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) +{ + u32 first = TC_H_MAKE(0xC0000000U,0U); + + if (tp) + first = tp->prio-1; + + return first; +} + +/* Add/change/delete/get a filter node */ + +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca; + struct tcmsg *t; + u32 protocol; + u32 prio; + u32 nprio; + u32 parent; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto **back, **chain; + struct tcf_proto *tp; + struct tcf_proto_ops *tp_ops; + struct Qdisc_class_ops *cops; + unsigned long cl; + unsigned long fh; + int err; + +replay: + tca = arg; + t = NLMSG_DATA(n); + protocol = TC_H_MIN(t->tcm_info); + prio = TC_H_MAJ(t->tcm_info); + nprio = prio; + parent = t->tcm_parent; + cl = 0; + + if (prio == 0) { + /* If no priority is given, user wants we allocated it. */ + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + prio = TC_H_MAKE(0x80000000U,0U); + } + + /* Find head of filter chain. */ + + /* Find link */ + if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) + return -ENODEV; + + /* Find qdisc */ + if (!parent) { + q = dev->qdisc_sleeping; + parent = q->handle; + } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) + return -EINVAL; + + /* Is it classful? */ + if ((cops = q->ops->cl_ops) == NULL) + return -EINVAL; + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(parent)) { + cl = cops->get(q, parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + chain = cops->tcf_chain(q, cl); + err = -EINVAL; + if (chain == NULL) + goto errout; + + /* Check the chain for existence of proto-tcf with this priority */ + for (back = chain; (tp=*back) != NULL; back = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (!nprio || (tp->protocol != protocol && protocol)) + goto errout; + } else + tp = NULL; + break; + } + } + + if (tp == NULL) { + /* Proto-tcf does not exist, create new one */ + + if (tca[TCA_KIND-1] == NULL || !protocol) + goto errout; + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + + + /* Create new proto tcf */ + + err = -ENOBUFS; + if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + goto errout; + err = -EINVAL; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + if (tp_ops == NULL) { +#ifdef CONFIG_KMOD + struct rtattr *kind = tca[TCA_KIND-1]; + char name[IFNAMSIZ]; + + if (kind != NULL && + rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + rtnl_unlock(); + request_module("cls_%s", name); + rtnl_lock(); + tp_ops = tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * replay the request. We indicate this using + * -EAGAIN. + */ + if (tp_ops != NULL) { + module_put(tp_ops->owner); + err = -EAGAIN; + } + } +#endif + kfree(tp); + goto errout; + } + memset(tp, 0, sizeof(*tp)); + tp->ops = tp_ops; + tp->protocol = protocol; + tp->prio = nprio ? : tcf_auto_prio(*back); + tp->q = q; + tp->classify = tp_ops->classify; + tp->classid = parent; + if ((err = tp_ops->init(tp)) != 0) { + module_put(tp_ops->owner); + kfree(tp); + goto errout; + } + + qdisc_lock_tree(dev); + tp->next = *back; + *back = tp; + qdisc_unlock_tree(dev); + + } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + goto errout; + + fh = tp->ops->get(tp, t->tcm_handle); + + if (fh == 0) { + if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + qdisc_lock_tree(dev); + *back = tp->next; + qdisc_unlock_tree(dev); + + tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tcf_destroy(tp); + err = 0; + goto errout; + } + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTFILTER: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto errout; + break; + case RTM_DELTFILTER: + err = tp->ops->delete(tp, fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + goto errout; + case RTM_GETTFILTER: + err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + goto errout; + default: + err = -EINVAL; + goto errout; + } + } + + err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + +errout: + if (cl) + cops->put(q, cl); + if (err == -EAGAIN) + /* Replay the request. */ + goto replay; + return err; +} + +static int +tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm_parent = tp->classid; + tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + tcm->tcm_handle = fh; + if (RTM_DELTFILTER != event) { + tcm->tcm_handle = 0; + if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + goto rtattr_failure; + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct tcf_dump_args +{ + struct tcf_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +{ + struct tcf_dump_args *a = (void*)arg; + + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); +} + +static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto *tp, **chain; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + unsigned long cl = 0; + struct Qdisc_class_ops *cops; + struct tcf_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return skb->len; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return skb->len; + + read_lock_bh(&qdisc_tree_lock); + if (!tcm->tcm_parent) + q = dev->qdisc_sleeping; + else + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + if (!q) + goto out; + if ((cops = q->ops->cl_ops) == NULL) + goto errout; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->get(q, tcm->tcm_parent); + if (cl == 0) + goto errout; + } + chain = cops->tcf_chain(q, cl); + if (chain == NULL) + goto errout; + + s_t = cb->args[0]; + + for (tp=*chain, t=0; tp; tp = tp->next, t++) { + if (t < s_t) continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if (cb->args[1] == 0) { + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + break; + } + cb->args[1] = 1; + } + if (tp->ops->walk == NULL) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]-1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count+1; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + +errout: + if (cl) + cops->put(q, cl); +out: + read_unlock_bh(&qdisc_tree_lock); + dev_put(dev); + return skb->len; +} + +void +tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->action) { + tcf_action_destroy(exts->action, TCA_ACT_UNBIND); + exts->action = NULL; + } +#elif defined CONFIG_NET_CLS_POLICE + if (exts->police) { + tcf_police_release(exts->police, TCA_ACT_UNBIND); + exts->police = NULL; + } +#endif +} + + +int +tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, + struct rtattr *rate_tlv, struct tcf_exts *exts, + struct tcf_ext_map *map) +{ + memset(exts, 0, sizeof(*exts)); + +#ifdef CONFIG_NET_CLS_ACT + { + int err; + struct tc_action *act; + + if (map->police && tb[map->police-1]) { + act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", + TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); + if (act == NULL) + return err; + + act->type = TCA_OLD_COMPAT; + exts->action = act; + } else if (map->action && tb[map->action-1]) { + act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, + TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); + if (act == NULL) + return err; + + exts->action = act; + } + } +#elif defined CONFIG_NET_CLS_POLICE + if (map->police && tb[map->police-1]) { + struct tcf_police *p; + + p = tcf_police_locate(tb[map->police-1], rate_tlv); + if (p == NULL) + return -EINVAL; + + exts->police = p; + } else if (map->action && tb[map->action-1]) + return -EOPNOTSUPP; +#else + if ((map->action && tb[map->action-1]) || + (map->police && tb[map->police-1])) + return -EOPNOTSUPP; +#endif + + return 0; +} + +void +tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src) +{ +#ifdef CONFIG_NET_CLS_ACT + if (src->action) { + struct tc_action *act; + tcf_tree_lock(tp); + act = xchg(&dst->action, src->action); + tcf_tree_unlock(tp); + if (act) + tcf_action_destroy(act, TCA_ACT_UNBIND); + } +#elif defined CONFIG_NET_CLS_POLICE + if (src->police) { + struct tcf_police *p; + tcf_tree_lock(tp); + p = xchg(&dst->police, src->police); + tcf_tree_unlock(tp); + if (p) + tcf_police_release(p, TCA_ACT_UNBIND); + } +#endif +} + +int +tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, + struct tcf_ext_map *map) +{ +#ifdef CONFIG_NET_CLS_ACT + if (map->action && exts->action) { + /* + * again for backward compatible mode - we want + * to work with both old and new modes of entering + * tc data even if iproute2 was newer - jhs + */ + struct rtattr * p_rta = (struct rtattr*) skb->tail; + + if (exts->action->type != TCA_OLD_COMPAT) { + RTA_PUT(skb, map->action, 0, NULL); + if (tcf_action_dump(skb, exts->action, 0, 0) < 0) + goto rtattr_failure; + p_rta->rta_len = skb->tail - (u8*)p_rta; + } else if (map->police) { + RTA_PUT(skb, map->police, 0, NULL); + if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) + goto rtattr_failure; + p_rta->rta_len = skb->tail - (u8*)p_rta; + } + } +#elif defined CONFIG_NET_CLS_POLICE + if (map->police && exts->police) { + struct rtattr * p_rta = (struct rtattr*) skb->tail; + + RTA_PUT(skb, map->police, 0, NULL); + + if (tcf_police_dump(skb, exts->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + return 0; +rtattr_failure: __attribute__ ((unused)) + return -1; +} + +int +tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, + struct tcf_ext_map *map) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->action) + if (tcf_action_copy_stats(skb, exts->action, 1) < 0) + goto rtattr_failure; +#elif defined CONFIG_NET_CLS_POLICE + if (exts->police) + if (tcf_police_dump_stats(skb, exts->police) < 0) + goto rtattr_failure; +#endif + return 0; +rtattr_failure: __attribute__ ((unused)) + return -1; +} + +static int __init tc_filter_init(void) +{ + struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; + } + return 0; +} + +subsys_initcall(tc_filter_init); + +EXPORT_SYMBOL(register_tcf_proto_ops); +EXPORT_SYMBOL(unregister_tcf_proto_ops); +EXPORT_SYMBOL(tcf_exts_validate); +EXPORT_SYMBOL(tcf_exts_destroy); +EXPORT_SYMBOL(tcf_exts_change); +EXPORT_SYMBOL(tcf_exts_dump); +EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c new file mode 100644 index 00000000000..0d2d4415f33 --- /dev/null +++ b/net/sched/cls_basic.c @@ -0,0 +1,303 @@ +/* + * net/sched/cls_basic.c Basic Packet Classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> + +struct basic_head +{ + u32 hgenerator; + struct list_head flist; +}; + +struct basic_filter +{ + u32 handle; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; + struct tcf_result res; + struct list_head link; +}; + +static struct tcf_ext_map basic_ext_map = { + .action = TCA_BASIC_ACT, + .police = TCA_BASIC_POLICE +}; + +static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int r; + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + list_for_each_entry(f, &head->flist, link) { + if (!tcf_em_tree_match(skb, &f->ematches, NULL)) + continue; + *res = f->res; + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + return r; + } + return -1; +} + +static unsigned long basic_get(struct tcf_proto *tp, u32 handle) +{ + unsigned long l = 0UL; + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + if (head == NULL) + return 0UL; + + list_for_each_entry(f, &head->flist, link) + if (f->handle == handle) + l = (unsigned long) f; + + return l; +} + +static void basic_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int basic_init(struct tcf_proto *tp) +{ + return 0; +} + +static inline void basic_delete_filter(struct tcf_proto *tp, + struct basic_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + tcf_em_tree_destroy(tp, &f->ematches); + kfree(f); +} + +static void basic_destroy(struct tcf_proto *tp) +{ + struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL); + struct basic_filter *f, *n; + + list_for_each_entry_safe(f, n, &head->flist, link) { + list_del(&f->link); + basic_delete_filter(tp, f); + } +} + +static int basic_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *t, *f = (struct basic_filter *) arg; + + list_for_each_entry(t, &head->flist, link) + if (t == f) { + tcf_tree_lock(tp); + list_del(&t->link); + tcf_tree_unlock(tp); + basic_delete_filter(tp, t); + return 0; + } + + return -ENOENT; +} + +static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, + unsigned long base, struct rtattr **tb, + struct rtattr *est) +{ + int err = -EINVAL; + struct tcf_exts e; + struct tcf_ematch_tree t; + + if (tb[TCA_BASIC_CLASSID-1]) + if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32)) + return err; + + err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t); + if (err < 0) + goto errout; + + if (tb[TCA_BASIC_CLASSID-1]) { + f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + tcf_em_tree_change(tp, &f->ematches, &t); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, unsigned long *arg) +{ + int err = -EINVAL; + struct basic_head *head = (struct basic_head *) tp->root; + struct rtattr *tb[TCA_BASIC_MAX]; + struct basic_filter *f = (struct basic_filter *) *arg; + + if (tca[TCA_OPTIONS-1] == NULL) + return -EINVAL; + + if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0) + return -EINVAL; + + if (f != NULL) { + if (handle && f->handle != handle) + return -EINVAL; + return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + } + + err = -ENOBUFS; + if (head == NULL) { + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + goto errout; + + memset(head, 0, sizeof(*head)); + INIT_LIST_HEAD(&head->flist); + tp->root = head; + } + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (f == NULL) + goto errout; + memset(f, 0, sizeof(*f)); + + err = -EINVAL; + if (handle) + f->handle = handle; + else { + int i = 0x80000000; + do { + if (++head->hgenerator == 0x7FFFFFFF) + head->hgenerator = 1; + } while (--i > 0 && basic_get(tp, head->hgenerator)); + + if (i <= 0) { + printk(KERN_ERR "Insufficient number of handles\n"); + goto errout; + } + + f->handle = head->hgenerator; + } + + err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + if (err < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&f->link, &head->flist); + tcf_tree_unlock(tp); + *arg = (unsigned long) f; + + return 0; +errout: + if (*arg == 0UL && f) + kfree(f); + + return err; +} + +static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + list_for_each_entry(f, &head->flist, link) { + if (arg->count < arg->skip) + goto skip; + + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int basic_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct basic_filter *f = (struct basic_filter *) fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + rta = (struct rtattr *) b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) + goto rtattr_failure; + + rta->rta_len = (skb->tail - b); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_basic_ops = { + .kind = "basic", + .classify = basic_classify, + .init = basic_init, + .destroy = basic_destroy, + .get = basic_get, + .put = basic_put, + .change = basic_change, + .delete = basic_delete, + .walk = basic_walk, + .dump = basic_dump, + .owner = THIS_MODULE, +}; + +static int __init init_basic(void) +{ + return register_tcf_proto_ops(&cls_basic_ops); +} + +static void __exit exit_basic(void) +{ + unregister_tcf_proto_ops(&cls_basic_ops); +} + +module_init(init_basic) +module_exit(exit_basic) +MODULE_LICENSE("GPL"); + diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c new file mode 100644 index 00000000000..fdfc83af3d1 --- /dev/null +++ b/net/sched/cls_fw.c @@ -0,0 +1,378 @@ +/* + * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one + * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). + * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension + * + * JHS: We should remove the CONFIG_NET_CLS_IND from here + * eventually when the meta match extension is made available + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/netfilter.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> + +struct fw_head +{ + struct fw_filter *ht[256]; +}; + +struct fw_filter +{ + struct fw_filter *next; + u32 id; + struct tcf_result res; +#ifdef CONFIG_NET_CLS_IND + char indev[IFNAMSIZ]; +#endif /* CONFIG_NET_CLS_IND */ + struct tcf_exts exts; +}; + +static struct tcf_ext_map fw_ext_map = { + .action = TCA_FW_ACT, + .police = TCA_FW_POLICE +}; + +static __inline__ int fw_hash(u32 handle) +{ + return handle&0xFF; +} + +static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; + int r; +#ifdef CONFIG_NETFILTER + u32 id = skb->nfmark; +#else + u32 id = 0; +#endif + + if (head != NULL) { + for (f=head->ht[fw_hash(id)]; f; f=f->next) { + if (f->id == id) { + *res = f->res; +#ifdef CONFIG_NET_CLS_IND + if (!tcf_match_indev(skb, f->indev)) + continue; +#endif /* CONFIG_NET_CLS_IND */ + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + + return r; + } + } + } else { + /* old method */ + if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + } + + return -1; +} + +static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; + + if (head == NULL) + return 0; + + for (f=head->ht[fw_hash(handle)]; f; f=f->next) { + if (f->id == handle) + return (unsigned long)f; + } + return 0; +} + +static void fw_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int fw_init(struct tcf_proto *tp) +{ + return 0; +} + +static inline void +fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void fw_destroy(struct tcf_proto *tp) +{ + struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); + struct fw_filter *f; + int h; + + if (head == NULL) + return; + + for (h=0; h<256; h++) { + while ((f=head->ht[h]) != NULL) { + head->ht[h] = f->next; + fw_delete_filter(tp, f); + } + } + kfree(head); +} + +static int fw_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f = (struct fw_filter*)arg; + struct fw_filter **fp; + + if (head == NULL || f == NULL) + goto out; + + for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + fw_delete_filter(tp, f); + return 0; + } + } +out: + return -EINVAL; +} + +static int +fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, + struct rtattr **tb, struct rtattr **tca, unsigned long base) +{ + struct tcf_exts e; + int err; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_FW_CLASSID-1]) { + if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FW_INDEV-1]) { + err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); + if (err < 0) + goto errout; + } +#endif /* CONFIG_NET_CLS_IND */ + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int fw_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f = (struct fw_filter *) *arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_FW_MAX]; + int err; + + if (!opt) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) + return -EINVAL; + + if (f != NULL) { + if (f->id != handle && handle) + return -EINVAL; + return fw_change_attrs(tp, f, tb, tca, base); + } + + if (!handle) + return -EINVAL; + + if (head == NULL) { + head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + memset(head, 0, sizeof(*head)); + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + memset(f, 0, sizeof(*f)); + + f->id = handle; + + err = fw_change_attrs(tp, f, tb, tca, base); + if (err < 0) + goto errout; + + f->next = head->ht[fw_hash(handle)]; + tcf_tree_lock(tp); + head->ht[fw_hash(handle)] = f; + tcf_tree_unlock(tp); + + *arg = (unsigned long)f; + return 0; + +errout: + if (f) + kfree(f); + return err; +} + +static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + int h; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct fw_filter *f; + + for (f = head->ht[h]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static int fw_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct fw_filter *f = (struct fw_filter*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->id; + + if (!f->res.classid && !tcf_exts_is_available(&f->exts)) + return skb->len; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (f->res.classid) + RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); +#ifdef CONFIG_NET_CLS_IND + if (strlen(f->indev)) + RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev); +#endif /* CONFIG_NET_CLS_IND */ + + if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) + goto rtattr_failure; + + rta->rta_len = skb->tail - b; + + if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) + goto rtattr_failure; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_fw_ops = { + .next = NULL, + .kind = "fw", + .classify = fw_classify, + .init = fw_init, + .destroy = fw_destroy, + .get = fw_get, + .put = fw_put, + .change = fw_change, + .delete = fw_delete, + .walk = fw_walk, + .dump = fw_dump, + .owner = THIS_MODULE, +}; + +static int __init init_fw(void) +{ + return register_tcf_proto_ops(&cls_fw_ops); +} + +static void __exit exit_fw(void) +{ + unregister_tcf_proto_ops(&cls_fw_ops); +} + +module_init(init_fw) +module_exit(exit_fw) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c new file mode 100644 index 00000000000..02996ac05c7 --- /dev/null +++ b/net/sched/cls_route.c @@ -0,0 +1,639 @@ +/* + * net/sched/cls_route.c ROUTE4 classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> + +/* + 1. For now we assume that route tags < 256. + It allows to use direct table lookups, instead of hash tables. + 2. For now we assume that "from TAG" and "fromdev DEV" statements + are mutually exclusive. + 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + */ + +struct route4_fastmap +{ + struct route4_filter *filter; + u32 id; + int iif; +}; + +struct route4_head +{ + struct route4_fastmap fastmap[16]; + struct route4_bucket *table[256+1]; +}; + +struct route4_bucket +{ + /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ + struct route4_filter *ht[16+16+1]; +}; + +struct route4_filter +{ + struct route4_filter *next; + u32 id; + int iif; + + struct tcf_result res; + struct tcf_exts exts; + u32 handle; + struct route4_bucket *bkt; +}; + +#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) + +static struct tcf_ext_map route_ext_map = { + .police = TCA_ROUTE4_POLICE, + .action = TCA_ROUTE4_ACT +}; + +static __inline__ int route4_fastmap_hash(u32 id, int iif) +{ + return id&0xF; +} + +static inline +void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) +{ + spin_lock_bh(&dev->queue_lock); + memset(head->fastmap, 0, sizeof(head->fastmap)); + spin_unlock_bh(&dev->queue_lock); +} + +static void __inline__ +route4_set_fastmap(struct route4_head *head, u32 id, int iif, + struct route4_filter *f) +{ + int h = route4_fastmap_hash(id, iif); + head->fastmap[h].id = id; + head->fastmap[h].iif = iif; + head->fastmap[h].filter = f; +} + +static __inline__ int route4_hash_to(u32 id) +{ + return id&0xFF; +} + +static __inline__ int route4_hash_from(u32 id) +{ + return (id>>16)&0xF; +} + +static __inline__ int route4_hash_iif(int iif) +{ + return 16 + ((iif>>16)&0xF); +} + +static __inline__ int route4_hash_wild(void) +{ + return 32; +} + +#define ROUTE4_APPLY_RESULT() \ +{ \ + *res = f->res; \ + if (tcf_exts_is_available(&f->exts)) { \ + int r = tcf_exts_exec(skb, &f->exts, res); \ + if (r < 0) { \ + dont_cache = 1; \ + continue; \ + } \ + return r; \ + } else if (!dont_cache) \ + route4_set_fastmap(head, id, iif, f); \ + return 0; \ +} + +static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct dst_entry *dst; + struct route4_bucket *b; + struct route4_filter *f; + u32 id, h; + int iif, dont_cache = 0; + + if ((dst = skb->dst) == NULL) + goto failure; + + id = dst->tclassid; + if (head == NULL) + goto old_method; + + iif = ((struct rtable*)dst)->fl.iif; + + h = route4_fastmap_hash(id, iif); + if (id == head->fastmap[h].id && + iif == head->fastmap[h].iif && + (f = head->fastmap[h].filter) != NULL) { + if (f == ROUTE4_FAILURE) + goto failure; + + *res = f->res; + return 0; + } + + h = route4_hash_to(id); + +restart: + if ((b = head->table[h]) != NULL) { + for (f = b->ht[route4_hash_from(id)]; f; f = f->next) + if (f->id == id) + ROUTE4_APPLY_RESULT(); + + for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) + if (f->iif == iif) + ROUTE4_APPLY_RESULT(); + + for (f = b->ht[route4_hash_wild()]; f; f = f->next) + ROUTE4_APPLY_RESULT(); + + } + if (h < 256) { + h = 256; + id &= ~0xFFFF; + goto restart; + } + + if (!dont_cache) + route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); +failure: + return -1; + +old_method: + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + return -1; +} + +static inline u32 to_hash(u32 id) +{ + u32 h = id&0xFF; + if (id&0x8000) + h += 256; + return h; +} + +static inline u32 from_hash(u32 id) +{ + id &= 0xFFFF; + if (id == 0xFFFF) + return 32; + if (!(id & 0x8000)) { + if (id > 255) + return 256; + return id&0xF; + } + return 16 + (id&0xF); +} + +static unsigned long route4_get(struct tcf_proto *tp, u32 handle) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct route4_bucket *b; + struct route4_filter *f; + unsigned h1, h2; + + if (!head) + return 0; + + h1 = to_hash(handle); + if (h1 > 256) + return 0; + + h2 = from_hash(handle>>16); + if (h2 > 32) + return 0; + + if ((b = head->table[h1]) != NULL) { + for (f = b->ht[h2]; f; f = f->next) + if (f->handle == handle) + return (unsigned long)f; + } + return 0; +} + +static void route4_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int route4_init(struct tcf_proto *tp) +{ + return 0; +} + +static inline void +route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void route4_destroy(struct tcf_proto *tp) +{ + struct route4_head *head = xchg(&tp->root, NULL); + int h1, h2; + + if (head == NULL) + return; + + for (h1=0; h1<=256; h1++) { + struct route4_bucket *b; + + if ((b = head->table[h1]) != NULL) { + for (h2=0; h2<=32; h2++) { + struct route4_filter *f; + + while ((f = b->ht[h2]) != NULL) { + b->ht[h2] = f->next; + route4_delete_filter(tp, f); + } + } + kfree(b); + } + } + kfree(head); +} + +static int route4_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct route4_filter **fp, *f = (struct route4_filter*)arg; + unsigned h = 0; + struct route4_bucket *b; + int i; + + if (!head || !f) + return -EINVAL; + + h = f->handle; + b = f->bkt; + + for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q->dev, head, f->id); + route4_delete_filter(tp, f); + + /* Strip tree */ + + for (i=0; i<=32; i++) + if (b->ht[i]) + return 0; + + /* OK, session has no flows */ + tcf_tree_lock(tp); + head->table[to_hash(h)] = NULL; + tcf_tree_unlock(tp); + + kfree(b); + return 0; + } + } + return 0; +} + +static int route4_set_parms(struct tcf_proto *tp, unsigned long base, + struct route4_filter *f, u32 handle, struct route4_head *head, + struct rtattr **tb, struct rtattr *est, int new) +{ + int err; + u32 id = 0, to = 0, nhandle = 0x8000; + struct route4_filter *fp; + unsigned int h1; + struct route4_bucket *b; + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_ROUTE4_CLASSID-1]) + if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32)) + goto errout; + + if (tb[TCA_ROUTE4_TO-1]) { + if (new && handle & 0x8000) + goto errout; + if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32)) + goto errout; + to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); + if (to > 0xFF) + goto errout; + nhandle = to; + } + + if (tb[TCA_ROUTE4_FROM-1]) { + if (tb[TCA_ROUTE4_IIF-1]) + goto errout; + if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32)) + goto errout; + id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]); + if (id > 0xFF) + goto errout; + nhandle |= id << 16; + } else if (tb[TCA_ROUTE4_IIF-1]) { + if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32)) + goto errout; + id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); + if (id > 0x7FFF) + goto errout; + nhandle |= (id | 0x8000) << 16; + } else + nhandle |= 0xFFFF << 16; + + if (handle && new) { + nhandle |= handle & 0x7F00; + if (nhandle != handle) + goto errout; + } + + h1 = to_hash(nhandle); + if ((b = head->table[h1]) == NULL) { + err = -ENOBUFS; + b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); + if (b == NULL) + goto errout; + memset(b, 0, sizeof(*b)); + + tcf_tree_lock(tp); + head->table[h1] = b; + tcf_tree_unlock(tp); + } else { + unsigned int h2 = from_hash(nhandle >> 16); + err = -EEXIST; + for (fp = b->ht[h2]; fp; fp = fp->next) + if (fp->handle == f->handle) + goto errout; + } + + tcf_tree_lock(tp); + if (tb[TCA_ROUTE4_TO-1]) + f->id = to; + + if (tb[TCA_ROUTE4_FROM-1]) + f->id = to | id<<16; + else if (tb[TCA_ROUTE4_IIF-1]) + f->iif = id; + + f->handle = nhandle; + f->bkt = b; + tcf_tree_unlock(tp); + + if (tb[TCA_ROUTE4_CLASSID-1]) { + f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int route4_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct route4_head *head = tp->root; + struct route4_filter *f, *f1, **fp; + struct route4_bucket *b; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_ROUTE4_MAX]; + unsigned int h, th; + u32 old_handle = 0; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0) + return -EINVAL; + + if ((f = (struct route4_filter*)*arg) != NULL) { + if (f->handle != handle && handle) + return -EINVAL; + + if (f->bkt) + old_handle = f->handle; + + err = route4_set_parms(tp, base, f, handle, head, tb, + tca[TCA_RATE-1], 0); + if (err < 0) + return err; + + goto reinsert; + } + + err = -ENOBUFS; + if (head == NULL) { + head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); + if (head == NULL) + goto errout; + memset(head, 0, sizeof(struct route4_head)); + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); + if (f == NULL) + goto errout; + memset(f, 0, sizeof(*f)); + + err = route4_set_parms(tp, base, f, handle, head, tb, + tca[TCA_RATE-1], 1); + if (err < 0) + goto errout; + +reinsert: + h = from_hash(f->handle >> 16); + for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) + if (f->handle < f1->handle) + break; + + f->next = f1; + tcf_tree_lock(tp); + *fp = f; + + if (old_handle && f->handle != old_handle) { + th = to_hash(old_handle); + h = from_hash(old_handle >> 16); + if ((b = head->table[th]) != NULL) { + for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + *fp = f->next; + break; + } + } + } + } + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q->dev, head, f->id); + *arg = (unsigned long)f; + return 0; + +errout: + if (f) + kfree(f); + return err; +} + +static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct route4_head *head = tp->root; + unsigned h, h1; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h <= 256; h++) { + struct route4_bucket *b = head->table[h]; + + if (b) { + for (h1 = 0; h1 <= 32; h1++) { + struct route4_filter *f; + + for (f = b->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } + } +} + +static int route4_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct route4_filter *f = (struct route4_filter*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + u32 id; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (!(f->handle&0x8000)) { + id = f->id&0xFF; + RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); + } + if (f->handle&0x80000000) { + if ((f->handle>>16) != 0xFFFF) + RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); + } else { + id = f->id>>16; + RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id); + } + if (f->res.classid) + RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); + + if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) + goto rtattr_failure; + + rta->rta_len = skb->tail - b; + + if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) + goto rtattr_failure; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_route4_ops = { + .next = NULL, + .kind = "route", + .classify = route4_classify, + .init = route4_init, + .destroy = route4_destroy, + .get = route4_get, + .put = route4_put, + .change = route4_change, + .delete = route4_delete, + .walk = route4_walk, + .dump = route4_dump, + .owner = THIS_MODULE, +}; + +static int __init init_route4(void) +{ + return register_tcf_proto_ops(&cls_route4_ops); +} + +static void __exit exit_route4(void) +{ + unregister_tcf_proto_ops(&cls_route4_ops); +} + +module_init(init_route4) +module_exit(exit_route4) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c new file mode 100644 index 00000000000..ad2613790d8 --- /dev/null +++ b/net/sched/cls_rsvp.c @@ -0,0 +1,43 @@ +/* + * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> + +#define RSVP_DST_LEN 1 +#define RSVP_ID "rsvp" +#define RSVP_OPS cls_rsvp_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h new file mode 100644 index 00000000000..232fb919681 --- /dev/null +++ b/net/sched/cls_rsvp.h @@ -0,0 +1,667 @@ +/* + * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +/* + Comparing to general packet classification problem, + RSVP needs only sevaral relatively simple rules: + + * (dst, protocol) are always specified, + so that we are able to hash them. + * src may be exact, or may be wildcard, so that + we can keep a hash table plus one wildcard entry. + * source port (or flow label) is important only if src is given. + + IMPLEMENTATION. + + We use a two level hash table: The top level is keyed by + destination address and protocol ID, every bucket contains a list + of "rsvp sessions", identified by destination address, protocol and + DPI(="Destination Port ID"): triple (key, mask, offset). + + Every bucket has a smaller hash table keyed by source address + (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. + Every bucket is again a list of "RSVP flows", selected by + source address and SPI(="Source Port ID" here rather than + "security parameter index"): triple (key, mask, offset). + + + NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) + and all fragmented packets go to the best-effort traffic class. + + + NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires + only one "Generalized Port Identifier". So that for classic + ah, esp (and udp,tcp) both *pi should coincide or one of them + should be wildcard. + + At first sight, this redundancy is just a waste of CPU + resources. But DPI and SPI add the possibility to assign different + priorities to GPIs. Look also at note 4 about tunnels below. + + + NOTE 3. One complication is the case of tunneled packets. + We implement it as following: if the first lookup + matches a special session with "tunnelhdr" value not zero, + flowid doesn't contain the true flow ID, but the tunnel ID (1...255). + In this case, we pull tunnelhdr bytes and restart lookup + with tunnel ID added to the list of keys. Simple and stupid 8)8) + It's enough for PIMREG and IPIP. + + + NOTE 4. Two GPIs make it possible to parse even GRE packets. + F.e. DPI can select ETH_P_IP (and necessary flags to make + tunnelhdr correct) in GRE protocol field and SPI matches + GRE key. Is it not nice? 8)8) + + + Well, as result, despite its simplicity, we get a pretty + powerful classification engine. */ + +#include <linux/config.h> + +struct rsvp_head +{ + u32 tmap[256/32]; + u32 hgenerator; + u8 tgenerator; + struct rsvp_session *ht[256]; +}; + +struct rsvp_session +{ + struct rsvp_session *next; + u32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; + /* 16 (src,sport) hash slots, and one wildcard source slot */ + struct rsvp_filter *ht[16+1]; +}; + + +struct rsvp_filter +{ + struct rsvp_filter *next; + u32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; + + struct tcf_result res; + struct tcf_exts exts; + + u32 handle; + struct rsvp_session *sess; +}; + +static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +{ + unsigned h = dst[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + return (h ^ protocol ^ tunnelid) & 0xFF; +} + +static __inline__ unsigned hash_src(u32 *src) +{ + unsigned h = src[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + h ^= h>>4; + return h & 0xF; +} + +static struct tcf_ext_map rsvp_ext_map = { + .police = TCA_RSVP_POLICE, + .action = TCA_RSVP_ACT +}; + +#define RSVP_APPLY_RESULT() \ +{ \ + int r = tcf_exts_exec(skb, &f->exts, res); \ + if (r < 0) \ + continue; \ + else if (r > 0) \ + return r; \ +} + +static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1, h2; + u32 *dst, *src; + u8 protocol; + u8 tunnelid = 0; + u8 *xprt; +#if RSVP_DST_LEN == 4 + struct ipv6hdr *nhptr = skb->nh.ipv6h; +#else + struct iphdr *nhptr = skb->nh.iph; +#endif + +restart: + +#if RSVP_DST_LEN == 4 + src = &nhptr->saddr.s6_addr32[0]; + dst = &nhptr->daddr.s6_addr32[0]; + protocol = nhptr->nexthdr; + xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); +#else + src = &nhptr->saddr; + dst = &nhptr->daddr; + protocol = nhptr->protocol; + xprt = ((u8*)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + return -1; +#endif + + h1 = hash_dst(dst, protocol, tunnelid); + h2 = hash_src(src); + + for (s = sht[h1]; s; s = s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + protocol == s->protocol && + !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && tunnelid == s->tunnelid) { + + for (f = s->ht[h2]; f; f = f->next) { + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && + !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) +#if RSVP_DST_LEN == 4 + && src[0] == f->src[0] + && src[1] == f->src[1] + && src[2] == f->src[2] +#endif + ) { + *res = f->res; + RSVP_APPLY_RESULT(); + +matched: + if (f->tunnelhdr == 0) + return 0; + + tunnelid = f->res.classid; + nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + goto restart; + } + } + + /* And wildcard bucket... */ + for (f = s->ht[16]; f; f = f->next) { + *res = f->res; + RSVP_APPLY_RESULT(); + goto matched; + } + return -1; + } + } + return -1; +} + +static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1 = handle&0xFF; + unsigned h2 = (handle>>8)&0xFF; + + if (h2 > 16) + return 0; + + for (s = sht[h1]; s; s = s->next) { + for (f = s->ht[h2]; f; f = f->next) { + if (f->handle == handle) + return (unsigned long)f; + } + } + return 0; +} + +static void rsvp_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int rsvp_init(struct tcf_proto *tp) +{ + struct rsvp_head *data; + + data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + if (data) { + memset(data, 0, sizeof(struct rsvp_head)); + tp->root = data; + return 0; + } + return -ENOBUFS; +} + +static inline void +rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void rsvp_destroy(struct tcf_proto *tp) +{ + struct rsvp_head *data = xchg(&tp->root, NULL); + struct rsvp_session **sht; + int h1, h2; + + if (data == NULL) + return; + + sht = data->ht; + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + + while ((s = sht[h1]) != NULL) { + sht[h1] = s->next; + + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + while ((f = s->ht[h2]) != NULL) { + s->ht[h2] = f->next; + rsvp_delete_filter(tp, f); + } + } + kfree(s); + } + } + kfree(data); +} + +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; + unsigned h = f->handle; + struct rsvp_session **sp; + struct rsvp_session *s = f->sess; + int i; + + for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + rsvp_delete_filter(tp, f); + + /* Strip tree */ + + for (i=0; i<=16; i++) + if (s->ht[i]) + return 0; + + /* OK, session has no flows */ + for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + *sp; sp = &(*sp)->next) { + if (*sp == s) { + tcf_tree_lock(tp); + *sp = s->next; + tcf_tree_unlock(tp); + + kfree(s); + return 0; + } + } + + return 0; + } + } + return 0; +} + +static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +{ + struct rsvp_head *data = tp->root; + int i = 0xFFFF; + + while (i-- > 0) { + u32 h; + if ((data->hgenerator += 0x10000) == 0) + data->hgenerator = 0x10000; + h = data->hgenerator|salt; + if (rsvp_get(tp, h) == 0) + return h; + } + return 0; +} + +static int tunnel_bts(struct rsvp_head *data) +{ + int n = data->tgenerator>>5; + u32 b = 1<<(data->tgenerator&0x1F); + + if (data->tmap[n]&b) + return 0; + data->tmap[n] |= b; + return 1; +} + +static void tunnel_recycle(struct rsvp_head *data) +{ + struct rsvp_session **sht = data->ht; + u32 tmap[256/32]; + int h1, h2; + + memset(tmap, 0, sizeof(tmap)); + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + for (s = sht[h1]; s; s = s->next) { + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + for (f = s->ht[h2]; f; f = f->next) { + if (f->tunnelhdr == 0) + continue; + data->tgenerator = f->res.classid; + tunnel_bts(data); + } + } + } + } + + memcpy(data->tmap, tmap, sizeof(tmap)); +} + +static u32 gen_tunnel(struct rsvp_head *data) +{ + int i, k; + + for (k=0; k<2; k++) { + for (i=255; i>0; i--) { + if (++data->tgenerator == 0) + data->tgenerator = 1; + if (tunnel_bts(data)) + return data->tgenerator; + } + tunnel_recycle(data); + } + return 0; +} + +static int rsvp_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct rsvp_head *data = tp->root; + struct rsvp_filter *f, **fp; + struct rsvp_session *s, **sp; + struct tc_rsvp_pinfo *pinfo = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_RSVP_MAX]; + struct tcf_exts e; + unsigned h1, h2; + u32 *dst; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0) + return -EINVAL; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); + if (err < 0) + return err; + + if ((f = (struct rsvp_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + goto errout2; + if (tb[TCA_RSVP_CLASSID-1]) { + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + return 0; + } + + /* Now more serious part... */ + err = -EINVAL; + if (handle) + goto errout2; + if (tb[TCA_RSVP_DST-1] == NULL) + goto errout2; + + err = -ENOBUFS; + f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + if (f == NULL) + goto errout2; + + memset(f, 0, sizeof(*f)); + h2 = 16; + if (tb[TCA_RSVP_SRC-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) + goto errout; + memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + h2 = hash_src(f->src); + } + if (tb[TCA_RSVP_PINFO-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) + goto errout; + pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + f->spi = pinfo->spi; + f->tunnelhdr = pinfo->tunnelhdr; + } + if (tb[TCA_RSVP_CLASSID-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + } + + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) + goto errout; + dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); + + err = -ENOMEM; + if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) + goto errout; + + if (f->tunnelhdr) { + err = -EINVAL; + if (f->res.classid > 255) + goto errout; + + err = -ENOMEM; + if (f->res.classid == 0 && + (f->res.classid = gen_tunnel(data)) == 0) + goto errout; + } + + for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + pinfo && pinfo->protocol == s->protocol && + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && pinfo->tunnelid == s->tunnelid) { + +insert: + /* OK, we found appropriate session */ + + fp = &s->ht[h2]; + + f->sess = s; + if (f->tunnelhdr == 0) + tcf_bind_filter(tp, &f->res, base); + + tcf_exts_change(tp, &f->exts, &e); + + for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) + if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + break; + f->next = *fp; + wmb(); + *fp = f; + + *arg = (unsigned long)f; + return 0; + } + } + + /* No session found. Create new one. */ + + err = -ENOBUFS; + s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + if (s == NULL) + goto errout; + memset(s, 0, sizeof(*s)); + memcpy(s->dst, dst, sizeof(s->dst)); + + if (pinfo) { + s->dpi = pinfo->dpi; + s->protocol = pinfo->protocol; + s->tunnelid = pinfo->tunnelid; + } + for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { + if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + break; + } + s->next = *sp; + wmb(); + *sp = s; + + goto insert; + +errout: + if (f) + kfree(f); +errout2: + tcf_exts_destroy(tp, &e); + return err; +} + +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct rsvp_head *head = tp->root; + unsigned h, h1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct rsvp_session *s; + + for (s = head->ht[h]; s; s = s->next) { + for (h1 = 0; h1 <= 16; h1++) { + struct rsvp_filter *f; + + for (f = s->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } + } +} + +static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_session *s; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_rsvp_pinfo pinfo; + + if (f == NULL) + return skb->len; + s = f->sess; + + t->tcm_handle = f->handle; + + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + pinfo.dpi = s->dpi; + pinfo.spi = f->spi; + pinfo.protocol = s->protocol; + pinfo.tunnelid = s->tunnelid; + pinfo.tunnelhdr = f->tunnelhdr; + RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + if (f->res.classid) + RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); + if (((f->handle>>8)&0xFF) != 16) + RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); + + if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) + goto rtattr_failure; + + rta->rta_len = skb->tail - b; + + if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) + goto rtattr_failure; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops RSVP_OPS = { + .next = NULL, + .kind = RSVP_ID, + .classify = rsvp_classify, + .init = rsvp_init, + .destroy = rsvp_destroy, + .get = rsvp_get, + .put = rsvp_put, + .change = rsvp_change, + .delete = rsvp_delete, + .walk = rsvp_walk, + .dump = rsvp_dump, + .owner = THIS_MODULE, +}; + +static int __init init_rsvp(void) +{ + return register_tcf_proto_ops(&RSVP_OPS); +} + +static void __exit exit_rsvp(void) +{ + unregister_tcf_proto_ops(&RSVP_OPS); +} + +module_init(init_rsvp) +module_exit(exit_rsvp) diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c new file mode 100644 index 00000000000..fde51f7848e --- /dev/null +++ b/net/sched/cls_rsvp6.c @@ -0,0 +1,44 @@ +/* + * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> + +#define RSVP_DST_LEN 4 +#define RSVP_ID "rsvp6" +#define RSVP_OPS cls_rsvp6_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c new file mode 100644 index 00000000000..404d9d83a7f --- /dev/null +++ b/net/sched/cls_tcindex.c @@ -0,0 +1,537 @@ +/* + * net/sched/cls_tcindex.c Packet classifier for skb->tc_index + * + * Written 1998,1999 by Werner Almesberger, EPFL ICA + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> +#include <net/route.h> + + +/* + * Not quite sure if we need all the xchgs Alexey uses when accessing things. + * Can always add them later ... :) + */ + +/* + * Passing parameters to the root seems to be done more awkwardly than really + * necessary. At least, u32 doesn't seem to use such dirty hacks. To be + * verified. FIXME. + */ + +#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ +#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ + + +#if 1 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +#define PRIV(tp) ((struct tcindex_data *) (tp)->root) + + +struct tcindex_filter_result { + struct tcf_exts exts; + struct tcf_result res; +}; + +struct tcindex_filter { + u16 key; + struct tcindex_filter_result result; + struct tcindex_filter *next; +}; + + +struct tcindex_data { + struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ + struct tcindex_filter **h; /* imperfect hash; only used if !perfect; + NULL if unused */ + u16 mask; /* AND key with mask */ + int shift; /* shift ANDed key to the right */ + int hash; /* hash table size; 0 if undefined */ + int alloc_hash; /* allocated size */ + int fall_through; /* 0: only classify if explicit match */ +}; + +static struct tcf_ext_map tcindex_ext_map = { + .police = TCA_TCINDEX_POLICE, + .action = TCA_TCINDEX_ACT +}; + +static inline int +tcindex_filter_is_set(struct tcindex_filter_result *r) +{ + return tcf_exts_is_predicative(&r->exts) || r->res.classid; +} + +static struct tcindex_filter_result * +tcindex_lookup(struct tcindex_data *p, u16 key) +{ + struct tcindex_filter *f; + + if (p->perfect) + return tcindex_filter_is_set(p->perfect + key) ? + p->perfect + key : NULL; + else if (p->h) { + for (f = p->h[key % p->hash]; f; f = f->next) + if (f->key == key) + return &f->result; + } + + return NULL; +} + + +static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *f; + int key = (skb->tc_index & p->mask) >> p->shift; + + D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); + + f = tcindex_lookup(p, key); + if (!f) { + if (!p->fall_through) + return -1; + res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); + res->class = 0; + D2PRINTK("alg 0x%x\n",res->classid); + return 0; + } + *res = f->res; + D2PRINTK("map 0x%x\n",res->classid); + + return tcf_exts_exec(skb, &f->exts, res); +} + + +static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r; + + DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); + if (p->perfect && handle >= p->alloc_hash) + return 0; + r = tcindex_lookup(p, handle); + return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL; +} + + +static void tcindex_put(struct tcf_proto *tp, unsigned long f) +{ + DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); +} + + +static int tcindex_init(struct tcf_proto *tp) +{ + struct tcindex_data *p; + + DPRINTK("tcindex_init(tp %p)\n",tp); + p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); + if (!p) + return -ENOMEM; + + memset(p, 0, sizeof(*p)); + p->mask = 0xffff; + p->hash = DEFAULT_HASH_SIZE; + p->fall_through = 1; + + tp->root = p; + return 0; +} + + +static int +__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter *f = NULL; + + DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); + if (p->perfect) { + if (!r->res.class) + return -ENOENT; + } else { + int i; + struct tcindex_filter **walk = NULL; + + for (i = 0; i < p->hash; i++) + for (walk = p->h+i; *walk; walk = &(*walk)->next) + if (&(*walk)->result == r) + goto found; + return -ENOENT; + +found: + f = *walk; + if (lock) + tcf_tree_lock(tp); + *walk = f->next; + if (lock) + tcf_tree_unlock(tp); + } + tcf_unbind_filter(tp, &r->res); + tcf_exts_destroy(tp, &r->exts); + if (f) + kfree(f); + return 0; +} + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +{ + return __tcindex_delete(tp, arg, 1); +} + +static inline int +valid_perfect_hash(struct tcindex_data *p) +{ + return p->hash > (p->mask >> p->shift); +} + +static int +tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, + struct tcindex_data *p, struct tcindex_filter_result *r, + struct rtattr **tb, struct rtattr *est) +{ + int err, balloc = 0; + struct tcindex_filter_result new_filter_result, *old_r = r; + struct tcindex_filter_result cr; + struct tcindex_data cp; + struct tcindex_filter *f = NULL; /* make gcc behave */ + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + if (err < 0) + return err; + + memcpy(&cp, p, sizeof(cp)); + memset(&new_filter_result, 0, sizeof(new_filter_result)); + + if (old_r) + memcpy(&cr, r, sizeof(cr)); + else + memset(&cr, 0, sizeof(cr)); + + err = -EINVAL; + if (tb[TCA_TCINDEX_HASH-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32)) + goto errout; + cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); + } + + if (tb[TCA_TCINDEX_MASK-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16)) + goto errout; + cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); + } + + if (tb[TCA_TCINDEX_SHIFT-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16)) + goto errout; + cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); + } + + err = -EBUSY; + /* Hash already allocated, make sure that we still meet the + * requirements for the allocated hash. + */ + if (cp.perfect) { + if (!valid_perfect_hash(&cp) || + cp.hash > cp.alloc_hash) + goto errout; + } else if (cp.h && cp.hash != cp.alloc_hash) + goto errout; + + err = -EINVAL; + if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32)) + goto errout; + cp.fall_through = + *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); + } + + if (!cp.hash) { + /* Hash not specified, use perfect hash if the upper limit + * of the hashing index is below the threshold. + */ + if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) + cp.hash = (cp.mask >> cp.shift)+1; + else + cp.hash = DEFAULT_HASH_SIZE; + } + + if (!cp.perfect && !cp.h) + cp.alloc_hash = cp.hash; + + /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) + * but then, we'd fail handles that may become valid after some future + * mask change. While this is extremely unlikely to ever matter, + * the check below is safer (and also more backwards-compatible). + */ + if (cp.perfect || valid_perfect_hash(&cp)) + if (handle >= cp.alloc_hash) + goto errout; + + + err = -ENOMEM; + if (!cp.perfect && !cp.h) { + if (valid_perfect_hash(&cp)) { + cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL); + if (!cp.perfect) + goto errout; + memset(cp.perfect, 0, cp.hash * sizeof(*r)); + balloc = 1; + } else { + cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL); + if (!cp.h) + goto errout; + memset(cp.h, 0, cp.hash * sizeof(f)); + balloc = 2; + } + } + + if (cp.perfect) + r = cp.perfect + handle; + else + r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + + if (r == &new_filter_result) { + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + goto errout_alloc; + memset(f, 0, sizeof(*f)); + } + + if (tb[TCA_TCINDEX_CLASSID-1]) { + cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); + tcf_bind_filter(tp, &cr.res, base); + } + + tcf_exts_change(tp, &cr.exts, &e); + + tcf_tree_lock(tp); + if (old_r && old_r != r) + memset(old_r, 0, sizeof(*old_r)); + + memcpy(p, &cp, sizeof(cp)); + memcpy(r, &cr, sizeof(cr)); + + if (r == &new_filter_result) { + struct tcindex_filter **fp; + + f->key = handle; + f->result = new_filter_result; + f->next = NULL; + for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) + /* nothing */; + *fp = f; + } + tcf_tree_unlock(tp); + + return 0; + +errout_alloc: + if (balloc == 1) + kfree(cp.perfect); + else if (balloc == 2) + kfree(cp.h); +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int +tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, unsigned long *arg) +{ + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_TCINDEX_MAX]; + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + + DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + "p %p,r %p,*arg 0x%lx\n", + tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L); + + if (!opt) + return 0; + + if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0) + return -EINVAL; + + return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]); +} + + +static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter *f,*next; + int i; + + DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); + if (p->perfect) { + for (i = 0; i < p->hash; i++) { + if (!p->perfect[i].res.class) + continue; + if (walker->count >= walker->skip) { + if (walker->fn(tp, + (unsigned long) (p->perfect+i), walker) + < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } + if (!p->h) + return; + for (i = 0; i < p->hash; i++) { + for (f = p->h[i]; f; f = next) { + next = f->next; + if (walker->count >= walker->skip) { + if (walker->fn(tp,(unsigned long) &f->result, + walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } +} + + +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, struct tcf_walker *walker) +{ + return __tcindex_delete(tp, arg, 0); +} + + +static void tcindex_destroy(struct tcf_proto *tp) +{ + struct tcindex_data *p = PRIV(tp); + struct tcf_walker walker; + + DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); + walker.count = 0; + walker.skip = 0; + walker.fn = &tcindex_destroy_element; + tcindex_walk(tp,&walker); + if (p->perfect) + kfree(p->perfect); + if (p->h) + kfree(p->h); + kfree(p); + tp->root = NULL; +} + + +static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp,fh,skb,t,p,r,b); + DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + if (!fh) { + t->tcm_handle = ~0; /* whatever ... */ + RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); + RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); + RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); + RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), + &p->fall_through); + rta->rta_len = skb->tail-b; + } else { + if (p->perfect) { + t->tcm_handle = r-p->perfect; + } else { + struct tcindex_filter *f; + int i; + + t->tcm_handle = 0; + for (i = 0; !t->tcm_handle && i < p->hash; i++) { + for (f = p->h[i]; !t->tcm_handle && f; + f = f->next) { + if (&f->result == r) + t->tcm_handle = f->key; + } + } + } + DPRINTK("handle = %d\n",t->tcm_handle); + if (r->res.class) + RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); + + if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail-b; + + if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) + goto rtattr_failure; + } + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_tcindex_ops = { + .next = NULL, + .kind = "tcindex", + .classify = tcindex_classify, + .init = tcindex_init, + .destroy = tcindex_destroy, + .get = tcindex_get, + .put = tcindex_put, + .change = tcindex_change, + .delete = tcindex_delete, + .walk = tcindex_walk, + .dump = tcindex_dump, + .owner = THIS_MODULE, +}; + +static int __init init_tcindex(void) +{ + return register_tcf_proto_ops(&cls_tcindex_ops); +} + +static void __exit exit_tcindex(void) +{ + unregister_tcf_proto_ops(&cls_tcindex_ops); +} + +module_init(init_tcindex) +module_exit(exit_tcindex) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c new file mode 100644 index 00000000000..364b87d8645 --- /dev/null +++ b/net/sched/cls_u32.c @@ -0,0 +1,828 @@ +/* + * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * The filters are packed to hash tables of key nodes + * with a set of 32bit key/mask pairs at every node. + * Nodes reference next level hash tables etc. + * + * This scheme is the best universal classifier I managed to + * invent; it is not super-fast, but it is not slow (provided you + * program it correctly), and general enough. And its relative + * speed grows as the number of rules becomes larger. + * + * It seems that it represents the best middle point between + * speed and manageability both by human and by machine. + * + * It is especially useful for link sharing combined with QoS; + * pure RSVP doesn't need such a general approach and can use + * much simpler (and faster) schemes, sort of cls_rsvp.c. + * + * JHS: We should remove the CONFIG_NET_CLS_IND from here + * eventually when the meta match extension is made available + * + * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/act_api.h> +#include <net/pkt_cls.h> + +struct tc_u_knode +{ + struct tc_u_knode *next; + u32 handle; + struct tc_u_hnode *ht_up; + struct tcf_exts exts; +#ifdef CONFIG_NET_CLS_IND + char indev[IFNAMSIZ]; +#endif + u8 fshift; + struct tcf_result res; + struct tc_u_hnode *ht_down; +#ifdef CONFIG_CLS_U32_PERF + struct tc_u32_pcnt *pf; +#endif +#ifdef CONFIG_CLS_U32_MARK + struct tc_u32_mark mark; +#endif + struct tc_u32_sel sel; +}; + +struct tc_u_hnode +{ + struct tc_u_hnode *next; + u32 handle; + u32 prio; + struct tc_u_common *tp_c; + int refcnt; + unsigned divisor; + struct tc_u_knode *ht[1]; +}; + +struct tc_u_common +{ + struct tc_u_common *next; + struct tc_u_hnode *hlist; + struct Qdisc *q; + int refcnt; + u32 hgenerator; +}; + +static struct tcf_ext_map u32_ext_map = { + .action = TCA_U32_ACT, + .police = TCA_U32_POLICE +}; + +static struct tc_u_common *u32_list; + +static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift) +{ + unsigned h = (key & sel->hmask)>>fshift; + + return h; +} + +static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + struct { + struct tc_u_knode *knode; + u8 *ptr; + } stack[TC_U32_MAXDEPTH]; + + struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; + u8 *ptr = skb->nh.raw; + struct tc_u_knode *n; + int sdepth = 0; + int off2 = 0; + int sel = 0; +#ifdef CONFIG_CLS_U32_PERF + int j; +#endif + int i, r; + +next_ht: + n = ht->ht[sel]; + +next_knode: + if (n) { + struct tc_u32_key *key = n->sel.keys; + +#ifdef CONFIG_CLS_U32_PERF + n->pf->rcnt +=1; + j = 0; +#endif + +#ifdef CONFIG_CLS_U32_MARK + if ((skb->nfmark & n->mark.mask) != n->mark.val) { + n = n->next; + goto next_knode; + } else { + n->mark.success++; + } +#endif + + for (i = n->sel.nkeys; i>0; i--, key++) { + + if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + n = n->next; + goto next_knode; + } +#ifdef CONFIG_CLS_U32_PERF + n->pf->kcnts[j] +=1; + j++; +#endif + } + if (n->ht_down == NULL) { +check_terminal: + if (n->sel.flags&TC_U32_TERMINAL) { + + *res = n->res; +#ifdef CONFIG_NET_CLS_IND + if (!tcf_match_indev(skb, n->indev)) { + n = n->next; + goto next_knode; + } +#endif +#ifdef CONFIG_CLS_U32_PERF + n->pf->rhit +=1; +#endif + r = tcf_exts_exec(skb, &n->exts, res); + if (r < 0) { + n = n->next; + goto next_knode; + } + + return r; + } + n = n->next; + goto next_knode; + } + + /* PUSH */ + if (sdepth >= TC_U32_MAXDEPTH) + goto deadloop; + stack[sdepth].knode = n; + stack[sdepth].ptr = ptr; + sdepth++; + + ht = n->ht_down; + sel = 0; + if (ht->divisor) + sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift); + + if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + goto next_ht; + + if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { + off2 = n->sel.off + 3; + if (n->sel.flags&TC_U32_VAROFFSET) + off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + off2 &= ~3; + } + if (n->sel.flags&TC_U32_EAT) { + ptr += off2; + off2 = 0; + } + + if (ptr < skb->tail) + goto next_ht; + } + + /* POP */ + if (sdepth--) { + n = stack[sdepth].knode; + ht = n->ht_up; + ptr = stack[sdepth].ptr; + goto check_terminal; + } + return -1; + +deadloop: + if (net_ratelimit()) + printk("cls_u32: dead loop\n"); + return -1; +} + +static __inline__ struct tc_u_hnode * +u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +{ + struct tc_u_hnode *ht; + + for (ht = tp_c->hlist; ht; ht = ht->next) + if (ht->handle == handle) + break; + + return ht; +} + +static __inline__ struct tc_u_knode * +u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +{ + unsigned sel; + struct tc_u_knode *n = NULL; + + sel = TC_U32_HASH(handle); + if (sel > ht->divisor) + goto out; + + for (n = ht->ht[sel]; n; n = n->next) + if (n->handle == handle) + break; +out: + return n; +} + + +static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +{ + struct tc_u_hnode *ht; + struct tc_u_common *tp_c = tp->data; + + if (TC_U32_HTID(handle) == TC_U32_ROOT) + ht = tp->root; + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); + + if (!ht) + return 0; + + if (TC_U32_KEY(handle) == 0) + return (unsigned long)ht; + + return (unsigned long)u32_lookup_key(ht, handle); +} + +static void u32_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static u32 gen_new_htid(struct tc_u_common *tp_c) +{ + int i = 0x800; + + do { + if (++tp_c->hgenerator == 0x7FF) + tp_c->hgenerator = 1; + } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + + return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; +} + +static int u32_init(struct tcf_proto *tp) +{ + struct tc_u_hnode *root_ht; + struct tc_u_common *tp_c; + + for (tp_c = u32_list; tp_c; tp_c = tp_c->next) + if (tp_c->q == tp->q) + break; + + root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + if (root_ht == NULL) + return -ENOBUFS; + + memset(root_ht, 0, sizeof(*root_ht)); + root_ht->divisor = 0; + root_ht->refcnt++; + root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + root_ht->prio = tp->prio; + + if (tp_c == NULL) { + tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + if (tp_c == NULL) { + kfree(root_ht); + return -ENOBUFS; + } + memset(tp_c, 0, sizeof(*tp_c)); + tp_c->q = tp->q; + tp_c->next = u32_list; + u32_list = tp_c; + } + + tp_c->refcnt++; + root_ht->next = tp_c->hlist; + tp_c->hlist = root_ht; + root_ht->tp_c = tp_c; + + tp->root = root_ht; + tp->data = tp_c; + return 0; +} + +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +{ + tcf_unbind_filter(tp, &n->res); + tcf_exts_destroy(tp, &n->exts); + if (n->ht_down) + n->ht_down->refcnt--; +#ifdef CONFIG_CLS_U32_PERF + if (n && (NULL != n->pf)) + kfree(n->pf); +#endif + kfree(n); + return 0; +} + +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +{ + struct tc_u_knode **kp; + struct tc_u_hnode *ht = key->ht_up; + + if (ht) { + for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { + if (*kp == key) { + tcf_tree_lock(tp); + *kp = key->next; + tcf_tree_unlock(tp); + + u32_destroy_key(tp, key); + return 0; + } + } + } + BUG_TRAP(0); + return 0; +} + +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_knode *n; + unsigned h; + + for (h=0; h<=ht->divisor; h++) { + while ((n = ht->ht[h]) != NULL) { + ht->ht[h] = n->next; + + u32_destroy_key(tp, n); + } + } +} + +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode **hn; + + BUG_TRAP(!ht->refcnt); + + u32_clear_hnode(tp, ht); + + for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { + if (*hn == ht) { + *hn = ht->next; + kfree(ht); + return 0; + } + } + + BUG_TRAP(0); + return -ENOENT; +} + +static void u32_destroy(struct tcf_proto *tp) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + + BUG_TRAP(root_ht != NULL); + + if (root_ht && --root_ht->refcnt == 0) + u32_destroy_hnode(tp, root_ht); + + if (--tp_c->refcnt == 0) { + struct tc_u_hnode *ht; + struct tc_u_common **tp_cp; + + for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { + if (*tp_cp == tp_c) { + *tp_cp = tp_c->next; + break; + } + } + + for (ht=tp_c->hlist; ht; ht = ht->next) + u32_clear_hnode(tp, ht); + + while ((ht = tp_c->hlist) != NULL) { + tp_c->hlist = ht->next; + + BUG_TRAP(ht->refcnt == 0); + + kfree(ht); + }; + + kfree(tp_c); + } + + tp->data = NULL; +} + +static int u32_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + + if (ht == NULL) + return 0; + + if (TC_U32_KEY(ht->handle)) + return u32_delete_key(tp, (struct tc_u_knode*)ht); + + if (tp->root == ht) + return -EINVAL; + + if (--ht->refcnt == 0) + u32_destroy_hnode(tp, ht); + + return 0; +} + +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +{ + struct tc_u_knode *n; + unsigned i = 0x7FF; + + for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + if (i < TC_U32_NODE(n->handle)) + i = TC_U32_NODE(n->handle); + i++; + + return handle|(i>0xFFF ? 0xFFF : i); +} + +static int u32_set_parms(struct tcf_proto *tp, unsigned long base, + struct tc_u_hnode *ht, + struct tc_u_knode *n, struct rtattr **tb, + struct rtattr *est) +{ + int err; + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_U32_LINK-1]) { + u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); + struct tc_u_hnode *ht_down = NULL; + + if (TC_U32_KEY(handle)) + goto errout; + + if (handle) { + ht_down = u32_lookup_ht(ht->tp_c, handle); + + if (ht_down == NULL) + goto errout; + ht_down->refcnt++; + } + + tcf_tree_lock(tp); + ht_down = xchg(&n->ht_down, ht_down); + tcf_tree_unlock(tp); + + if (ht_down) + ht_down->refcnt--; + } + if (tb[TCA_U32_CLASSID-1]) { + n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + tcf_bind_filter(tp, &n->res, base); + } + +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_U32_INDEV-1]) { + int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]); + if (err < 0) + goto errout; + } +#endif + tcf_exts_change(tp, &n->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + struct tc_u32_sel *s; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_U32_MAX]; + u32 htid; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0) + return -EINVAL; + + if ((n = (struct tc_u_knode*)*arg) != NULL) { + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + + return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]); + } + + if (tb[TCA_U32_DIVISOR-1]) { + unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + + if (--divisor > 0x100) + return -EINVAL; + if (TC_U32_KEY(handle)) + return -EINVAL; + if (handle == 0) { + handle = gen_new_htid(tp->data); + if (handle == 0) + return -ENOMEM; + } + ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + if (ht == NULL) + return -ENOBUFS; + memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); + ht->tp_c = tp_c; + ht->refcnt = 0; + ht->divisor = divisor; + ht->handle = handle; + ht->prio = tp->prio; + ht->next = tp_c->hlist; + tp_c->hlist = ht; + *arg = (unsigned long)ht; + return 0; + } + + if (tb[TCA_U32_HASH-1]) { + htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (TC_U32_HTID(htid) == TC_U32_ROOT) { + ht = tp->root; + htid = ht->handle; + } else { + ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); + if (ht == NULL) + return -EINVAL; + } + } else { + ht = tp->root; + htid = ht->handle; + } + + if (ht->divisor < TC_U32_HASH(htid)) + return -EINVAL; + + if (handle) { + if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + return -EINVAL; + handle = htid | TC_U32_NODE(handle); + } else + handle = gen_new_kid(ht, htid); + + if (tb[TCA_U32_SEL-1] == 0 || + RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + return -EINVAL; + + s = RTA_DATA(tb[TCA_U32_SEL-1]); + + n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + if (n == NULL) + return -ENOBUFS; + + memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); +#ifdef CONFIG_CLS_U32_PERF + n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); + if (n->pf == NULL) { + kfree(n); + return -ENOBUFS; + } + memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64)); +#endif + + memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + n->ht_up = ht; + n->handle = handle; +{ + u8 i = 0; + u32 mask = s->hmask; + if (mask) { + while (!(mask & 1)) { + i++; + mask>>=1; + } + } + n->fshift = i; +} + +#ifdef CONFIG_CLS_U32_MARK + if (tb[TCA_U32_MARK-1]) { + struct tc_u32_mark *mark; + + if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) { +#ifdef CONFIG_CLS_U32_PERF + kfree(n->pf); +#endif + kfree(n); + return -EINVAL; + } + mark = RTA_DATA(tb[TCA_U32_MARK-1]); + memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); + n->mark.success = 0; + } +#endif + + err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]); + if (err == 0) { + struct tc_u_knode **ins; + for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) + if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) + break; + + n->next = *ins; + wmb(); + *ins = n; + + *arg = (unsigned long)n; + return 0; + } +#ifdef CONFIG_CLS_U32_PERF + if (n && (NULL != n->pf)) + kfree(n->pf); +#endif + kfree(n); + return err; +} + +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned h; + + if (arg->stop) + return; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + if (ht->prio != tp->prio) + continue; + if (arg->count >= arg->skip) { + if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + arg->stop = 1; + return; + } + } + arg->count++; + for (h = 0; h <= ht->divisor; h++) { + for (n = ht->ht[h]; n; n = n->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)n, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } +} + +static int u32_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tc_u_knode *n = (struct tc_u_knode*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (n == NULL) + return skb->len; + + t->tcm_handle = n->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (TC_U32_KEY(n->handle) == 0) { + struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; + u32 divisor = ht->divisor+1; + RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + } else { + RTA_PUT(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel); + if (n->ht_up) { + u32 htid = n->handle & 0xFFFFF000; + RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + } + if (n->res.classid) + RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); + if (n->ht_down) + RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); + +#ifdef CONFIG_CLS_U32_MARK + if (n->mark.val || n->mark.mask) + RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); +#endif + + if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) + goto rtattr_failure; + +#ifdef CONFIG_NET_CLS_IND + if(strlen(n->indev)) + RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev); +#endif +#ifdef CONFIG_CLS_U32_PERF + RTA_PUT(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf); +#endif + } + + rta->rta_len = skb->tail - b; + if (TC_U32_KEY(n->handle)) + if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) + goto rtattr_failure; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_u32_ops = { + .next = NULL, + .kind = "u32", + .classify = u32_classify, + .init = u32_init, + .destroy = u32_destroy, + .get = u32_get, + .put = u32_put, + .change = u32_change, + .delete = u32_delete, + .walk = u32_walk, + .dump = u32_dump, + .owner = THIS_MODULE, +}; + +static int __init init_u32(void) +{ + printk("u32 classifier\n"); +#ifdef CONFIG_CLS_U32_PERF + printk(" Perfomance counters on\n"); +#endif +#ifdef CONFIG_NET_CLS_POLICE + printk(" OLD policer on \n"); +#endif +#ifdef CONFIG_NET_CLS_IND + printk(" input device check on \n"); +#endif +#ifdef CONFIG_NET_CLS_ACT + printk(" Actions configured \n"); +#endif + return register_tcf_proto_ops(&cls_u32_ops); +} + +static void __exit exit_u32(void) +{ + unregister_tcf_proto_ops(&cls_u32_ops); +} + +module_init(init_u32) +module_exit(exit_u32) +MODULE_LICENSE("GPL"); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c new file mode 100644 index 00000000000..bf1f00f8b1b --- /dev/null +++ b/net/sched/em_cmp.c @@ -0,0 +1,101 @@ +/* + * net/sched/em_cmp.c Simple packet data comparison ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/tc_ematch/tc_em_cmp.h> +#include <net/pkt_cls.h> + +static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) +{ + return unlikely(cmp->flags & TCF_EM_CMP_TRANS); +} + +static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; + unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; + u32 val = 0; + + if (!tcf_valid_offset(skb, ptr, cmp->align)) + return 0; + + switch (cmp->align) { + case TCF_EM_ALIGN_U8: + val = *ptr; + break; + + case TCF_EM_ALIGN_U16: + val = *ptr << 8; + val |= *(ptr+1); + + if (cmp_needs_transformation(cmp)) + val = be16_to_cpu(val); + break; + + case TCF_EM_ALIGN_U32: + /* Worth checking boundries? The branching seems + * to get worse. Visit again. */ + val = *ptr << 24; + val |= *(ptr+1) << 16; + val |= *(ptr+2) << 8; + val |= *(ptr+3); + + if (cmp_needs_transformation(cmp)) + val = be32_to_cpu(val); + break; + + default: + return 0; + } + + if (cmp->mask) + val &= cmp->mask; + + switch (cmp->opnd) { + case TCF_EM_OPND_EQ: + return val == cmp->val; + case TCF_EM_OPND_LT: + return val < cmp->val; + case TCF_EM_OPND_GT: + return val > cmp->val; + } + + return 0; +} + +static struct tcf_ematch_ops em_cmp_ops = { + .kind = TCF_EM_CMP, + .datalen = sizeof(struct tcf_em_cmp), + .match = em_cmp_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_cmp_ops.link) +}; + +static int __init init_em_cmp(void) +{ + return tcf_em_register(&em_cmp_ops); +} + +static void __exit exit_em_cmp(void) +{ + tcf_em_unregister(&em_cmp_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_cmp); +module_exit(exit_em_cmp); + diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c new file mode 100644 index 00000000000..f1eeaf65cee --- /dev/null +++ b/net/sched/em_meta.c @@ -0,0 +1,661 @@ +/* + * net/sched/em_meta.c Metadata ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * + * ========================================================================== + * + * The metadata ematch compares two meta objects where each object + * represents either a meta value stored in the kernel or a static + * value provided by userspace. The objects are not provided by + * userspace itself but rather a definition providing the information + * to build them. Every object is of a certain type which must be + * equal to the object it is being compared to. + * + * The definition of a objects conists of the type (meta type), a + * identifier (meta id) and additional type specific information. + * The meta id is either TCF_META_TYPE_VALUE for values provided by + * userspace or a index to the meta operations table consisting of + * function pointers to type specific meta data collectors returning + * the value of the requested meta value. + * + * lvalue rvalue + * +-----------+ +-----------+ + * | type: INT | | type: INT | + * def | id: INDEV | | id: VALUE | + * | data: | | data: 3 | + * +-----------+ +-----------+ + * | | + * ---> meta_ops[INT][INDEV](...) | + * | | + * ----------- | + * V V + * +-----------+ +-----------+ + * | type: INT | | type: INT | + * obj | id: INDEV | | id: VALUE | + * | data: 2 |<--data got filled out | data: 3 | + * +-----------+ +-----------+ + * | | + * --------------> 2 equals 3 <-------------- + * + * This is a simplified schema, the complexity varies depending + * on the meta type. Obviously, the length of the data must also + * be provided for non-numeric types. + * + * Additionaly, type dependant modifiers such as shift operators + * or mask may be applied to extend the functionaliy. As of now, + * the variable length type supports shifting the byte string to + * the right, eating up any number of octets and thus supporting + * wildcard interface name comparisons such as "ppp%" matching + * ppp0..9. + * + * NOTE: Certain meta values depend on other subsystems and are + * only available if that subsytem is enabled in the kernel. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/random.h> +#include <linux/tc_ematch/tc_em_meta.h> +#include <net/dst.h> +#include <net/route.h> +#include <net/pkt_cls.h> + +struct meta_obj +{ + unsigned long value; + unsigned int len; +}; + +struct meta_value +{ + struct tcf_meta_val hdr; + unsigned long val; + unsigned int len; +}; + +struct meta_match +{ + struct meta_value lvalue; + struct meta_value rvalue; +}; + +static inline int meta_id(struct meta_value *v) +{ + return TCF_META_ID(v->hdr.kind); +} + +static inline int meta_type(struct meta_value *v) +{ + return TCF_META_TYPE(v->hdr.kind); +} + +#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \ + struct tcf_pkt_info *info, struct meta_value *v, \ + struct meta_obj *dst, int *err) + +/************************************************************************** + * System status & misc + **************************************************************************/ + +META_COLLECTOR(int_random) +{ + get_random_bytes(&dst->value, sizeof(dst->value)); +} + +static inline unsigned long fixed_loadavg(int load) +{ + int rnd_load = load + (FIXED_1/200); + int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT; + + return ((rnd_load >> FSHIFT) * 100) + rnd_frac; +} + +META_COLLECTOR(int_loadavg_0) +{ + dst->value = fixed_loadavg(avenrun[0]); +} + +META_COLLECTOR(int_loadavg_1) +{ + dst->value = fixed_loadavg(avenrun[1]); +} + +META_COLLECTOR(int_loadavg_2) +{ + dst->value = fixed_loadavg(avenrun[2]); +} + +/************************************************************************** + * Device names & indices + **************************************************************************/ + +static inline int int_dev(struct net_device *dev, struct meta_obj *dst) +{ + if (unlikely(dev == NULL)) + return -1; + + dst->value = dev->ifindex; + return 0; +} + +static inline int var_dev(struct net_device *dev, struct meta_obj *dst) +{ + if (unlikely(dev == NULL)) + return -1; + + dst->value = (unsigned long) dev->name; + dst->len = strlen(dev->name); + return 0; +} + +META_COLLECTOR(int_dev) +{ + *err = int_dev(skb->dev, dst); +} + +META_COLLECTOR(var_dev) +{ + *err = var_dev(skb->dev, dst); +} + +META_COLLECTOR(int_indev) +{ + *err = int_dev(skb->input_dev, dst); +} + +META_COLLECTOR(var_indev) +{ + *err = var_dev(skb->input_dev, dst); +} + +META_COLLECTOR(int_realdev) +{ + *err = int_dev(skb->real_dev, dst); +} + +META_COLLECTOR(var_realdev) +{ + *err = var_dev(skb->real_dev, dst); +} + +/************************************************************************** + * skb attributes + **************************************************************************/ + +META_COLLECTOR(int_priority) +{ + dst->value = skb->priority; +} + +META_COLLECTOR(int_protocol) +{ + /* Let userspace take care of the byte ordering */ + dst->value = skb->protocol; +} + +META_COLLECTOR(int_security) +{ + dst->value = skb->security; +} + +META_COLLECTOR(int_pkttype) +{ + dst->value = skb->pkt_type; +} + +META_COLLECTOR(int_pktlen) +{ + dst->value = skb->len; +} + +META_COLLECTOR(int_datalen) +{ + dst->value = skb->data_len; +} + +META_COLLECTOR(int_maclen) +{ + dst->value = skb->mac_len; +} + +/************************************************************************** + * Netfilter + **************************************************************************/ + +#ifdef CONFIG_NETFILTER +META_COLLECTOR(int_nfmark) +{ + dst->value = skb->nfmark; +} +#endif + +/************************************************************************** + * Traffic Control + **************************************************************************/ + +META_COLLECTOR(int_tcindex) +{ + dst->value = skb->tc_index; +} + +#ifdef CONFIG_NET_CLS_ACT +META_COLLECTOR(int_tcverd) +{ + dst->value = skb->tc_verd; +} + +META_COLLECTOR(int_tcclassid) +{ + dst->value = skb->tc_classid; +} +#endif + +/************************************************************************** + * Routing + **************************************************************************/ + +#ifdef CONFIG_NET_CLS_ROUTE +META_COLLECTOR(int_rtclassid) +{ + if (unlikely(skb->dst == NULL)) + *err = -1; + else + dst->value = skb->dst->tclassid; +} +#endif + +META_COLLECTOR(int_rtiif) +{ + if (unlikely(skb->dst == NULL)) + *err = -1; + else + dst->value = ((struct rtable*) skb->dst)->fl.iif; +} + +/************************************************************************** + * Meta value collectors assignment table + **************************************************************************/ + +struct meta_ops +{ + void (*get)(struct sk_buff *, struct tcf_pkt_info *, + struct meta_value *, struct meta_obj *, int *); +}; + +/* Meta value operations table listing all meta value collectors and + * assigns them to a type and meta id. */ +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { + [TCF_META_TYPE_VAR] = { + [TCF_META_ID_DEV] = { .get = meta_var_dev }, + [TCF_META_ID_INDEV] = { .get = meta_var_indev }, + [TCF_META_ID_REALDEV] = { .get = meta_var_realdev } + }, + [TCF_META_TYPE_INT] = { + [TCF_META_ID_RANDOM] = { .get = meta_int_random }, + [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 }, + [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 }, + [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 }, + [TCF_META_ID_DEV] = { .get = meta_int_dev }, + [TCF_META_ID_INDEV] = { .get = meta_int_indev }, + [TCF_META_ID_REALDEV] = { .get = meta_int_realdev }, + [TCF_META_ID_PRIORITY] = { .get = meta_int_priority }, + [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol }, + [TCF_META_ID_SECURITY] = { .get = meta_int_security }, + [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype }, + [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen }, + [TCF_META_ID_DATALEN] = { .get = meta_int_datalen }, + [TCF_META_ID_MACLEN] = { .get = meta_int_maclen }, +#ifdef CONFIG_NETFILTER + [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark }, +#endif + [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex }, +#ifdef CONFIG_NET_CLS_ACT + [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd }, + [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid }, +#endif +#ifdef CONFIG_NET_CLS_ROUTE + [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid }, +#endif + [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif } + } +}; + +static inline struct meta_ops * meta_ops(struct meta_value *val) +{ + return &__meta_ops[meta_type(val)][meta_id(val)]; +} + +/************************************************************************** + * Type specific operations for TCF_META_TYPE_VAR + **************************************************************************/ + +static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) +{ + int r = a->len - b->len; + + if (r == 0) + r = memcmp((void *) a->value, (void *) b->value, a->len); + + return r; +} + +static int meta_var_change(struct meta_value *dst, struct rtattr *rta) +{ + int len = RTA_PAYLOAD(rta); + + dst->val = (unsigned long) kmalloc(len, GFP_KERNEL); + if (dst->val == 0UL) + return -ENOMEM; + memcpy((void *) dst->val, RTA_DATA(rta), len); + dst->len = len; + return 0; +} + +static void meta_var_destroy(struct meta_value *v) +{ + if (v->val) + kfree((void *) v->val); +} + +static void meta_var_apply_extras(struct meta_value *v, + struct meta_obj *dst) +{ + int shift = v->hdr.shift; + + if (shift && shift < dst->len) + dst->len -= shift; +} + +static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) +{ + if (v->val && v->len) + RTA_PUT(skb, tlv, v->len, (void *) v->val); + return 0; + +rtattr_failure: + return -1; +} + +/************************************************************************** + * Type specific operations for TCF_META_TYPE_INT + **************************************************************************/ + +static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) +{ + /* Let gcc optimize it, the unlikely is not really based on + * some numbers but jump free code for mismatches seems + * more logical. */ + if (unlikely(a == b)) + return 0; + else if (a < b) + return -1; + else + return 1; +} + +static int meta_int_change(struct meta_value *dst, struct rtattr *rta) +{ + if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) { + dst->val = *(unsigned long *) RTA_DATA(rta); + dst->len = sizeof(unsigned long); + } else if (RTA_PAYLOAD(rta) == sizeof(u32)) { + dst->val = *(u32 *) RTA_DATA(rta); + dst->len = sizeof(u32); + } else + return -EINVAL; + + return 0; +} + +static void meta_int_apply_extras(struct meta_value *v, + struct meta_obj *dst) +{ + if (v->hdr.shift) + dst->value >>= v->hdr.shift; + + if (v->val) + dst->value &= v->val; +} + +static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) +{ + if (v->len == sizeof(unsigned long)) + RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val); + else if (v->len == sizeof(u32)) { + u32 d = v->val; + RTA_PUT(skb, tlv, sizeof(d), &d); + } + + return 0; + +rtattr_failure: + return -1; +} + +/************************************************************************** + * Type specific operations table + **************************************************************************/ + +struct meta_type_ops +{ + void (*destroy)(struct meta_value *); + int (*compare)(struct meta_obj *, struct meta_obj *); + int (*change)(struct meta_value *, struct rtattr *); + void (*apply_extras)(struct meta_value *, struct meta_obj *); + int (*dump)(struct sk_buff *, struct meta_value *, int); +}; + +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { + [TCF_META_TYPE_VAR] = { + .destroy = meta_var_destroy, + .compare = meta_var_compare, + .change = meta_var_change, + .apply_extras = meta_var_apply_extras, + .dump = meta_var_dump + }, + [TCF_META_TYPE_INT] = { + .compare = meta_int_compare, + .change = meta_int_change, + .apply_extras = meta_int_apply_extras, + .dump = meta_int_dump + } +}; + +static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) +{ + return &__meta_type_ops[meta_type(v)]; +} + +/************************************************************************** + * Core + **************************************************************************/ + +static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, + struct meta_value *v, struct meta_obj *dst) +{ + int err = 0; + + if (meta_id(v) == TCF_META_ID_VALUE) { + dst->value = v->val; + dst->len = v->len; + return 0; + } + + meta_ops(v)->get(skb, info, v, dst, &err); + if (err < 0) + return err; + + if (meta_type_ops(v)->apply_extras) + meta_type_ops(v)->apply_extras(v, dst); + + return 0; +} + +static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + int r; + struct meta_match *meta = (struct meta_match *) m->data; + struct meta_obj l_value, r_value; + + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 || + meta_get(skb, info, &meta->rvalue, &r_value) < 0) + return 0; + + r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); + + switch (meta->lvalue.hdr.op) { + case TCF_EM_OPND_EQ: + return !r; + case TCF_EM_OPND_LT: + return r < 0; + case TCF_EM_OPND_GT: + return r > 0; + } + + return 0; +} + +static inline void meta_delete(struct meta_match *meta) +{ + struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); + + if (ops && ops->destroy) { + ops->destroy(&meta->lvalue); + ops->destroy(&meta->rvalue); + } + + kfree(meta); +} + +static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) +{ + if (rta) { + if (RTA_PAYLOAD(rta) == 0) + return -EINVAL; + + return meta_type_ops(dst)->change(dst, rta); + } + + return 0; +} + +static inline int meta_is_supported(struct meta_value *val) +{ + return (!meta_id(val) || meta_ops(val)->get); +} + +static int em_meta_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + int err = -EINVAL; + struct rtattr *tb[TCA_EM_META_MAX]; + struct tcf_meta_hdr *hdr; + struct meta_match *meta = NULL; + + if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0) + goto errout; + + if (tb[TCA_EM_META_HDR-1] == NULL || + RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr)) + goto errout; + hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]); + + if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || + TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || + TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX || + TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) + goto errout; + + meta = kmalloc(sizeof(*meta), GFP_KERNEL); + if (meta == NULL) + goto errout; + memset(meta, 0, sizeof(*meta)); + + memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); + memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); + + if (!meta_is_supported(&meta->lvalue) || + !meta_is_supported(&meta->rvalue)) { + err = -EOPNOTSUPP; + goto errout; + } + + if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 || + meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0) + goto errout; + + m->datalen = sizeof(*meta); + m->data = (unsigned long) meta; + + err = 0; +errout: + if (err && meta) + meta_delete(meta); + return err; +} + +static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + if (m) + meta_delete((struct meta_match *) m->data); +} + +static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) +{ + struct meta_match *meta = (struct meta_match *) em->data; + struct tcf_meta_hdr hdr; + struct meta_type_ops *ops; + + memset(&hdr, 0, sizeof(hdr)); + memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); + memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); + + RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + + ops = meta_type_ops(&meta->lvalue); + if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || + ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) + goto rtattr_failure; + + return 0; + +rtattr_failure: + return -1; +} + +static struct tcf_ematch_ops em_meta_ops = { + .kind = TCF_EM_META, + .change = em_meta_change, + .match = em_meta_match, + .destroy = em_meta_destroy, + .dump = em_meta_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_meta_ops.link) +}; + +static int __init init_em_meta(void) +{ + return tcf_em_register(&em_meta_ops); +} + +static void __exit exit_em_meta(void) +{ + tcf_em_unregister(&em_meta_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_meta); +module_exit(exit_em_meta); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c new file mode 100644 index 00000000000..71ea926a9f0 --- /dev/null +++ b/net/sched/em_nbyte.c @@ -0,0 +1,82 @@ +/* + * net/sched/em_nbyte.c N-Byte ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/tc_ematch/tc_em_nbyte.h> +#include <net/pkt_cls.h> + +struct nbyte_data +{ + struct tcf_em_nbyte hdr; + char pattern[0]; +}; + +static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct tcf_em_nbyte *nbyte = data; + + if (data_len < sizeof(*nbyte) || + data_len < (sizeof(*nbyte) + nbyte->len)) + return -EINVAL; + + em->datalen = sizeof(*nbyte) + nbyte->len; + em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL); + if (em->data == 0UL) + return -ENOBUFS; + + memcpy((void *) em->data, data, em->datalen); + + return 0; +} + +static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct nbyte_data *nbyte = (struct nbyte_data *) em->data; + unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); + + ptr += nbyte->hdr.off; + + if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) + return 0; + + return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len); +} + +static struct tcf_ematch_ops em_nbyte_ops = { + .kind = TCF_EM_NBYTE, + .change = em_nbyte_change, + .match = em_nbyte_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_nbyte_ops.link) +}; + +static int __init init_em_nbyte(void) +{ + return tcf_em_register(&em_nbyte_ops); +} + +static void __exit exit_em_nbyte(void) +{ + tcf_em_unregister(&em_nbyte_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_nbyte); +module_exit(exit_em_nbyte); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c new file mode 100644 index 00000000000..34e7e51e601 --- /dev/null +++ b/net/sched/em_u32.c @@ -0,0 +1,63 @@ +/* + * net/sched/em_u32.c U32 Ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Based on net/sched/cls_u32.c + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <net/pkt_cls.h> + +static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct tc_u32_key *key = (struct tc_u32_key *) em->data; + unsigned char *ptr = skb->nh.raw; + + if (info) { + if (info->ptr) + ptr = info->ptr; + ptr += (info->nexthdr & key->offmask); + } + + ptr += key->off; + + if (!tcf_valid_offset(skb, ptr, sizeof(u32))) + return 0; + + return !(((*(u32*) ptr) ^ key->val) & key->mask); +} + +static struct tcf_ematch_ops em_u32_ops = { + .kind = TCF_EM_U32, + .datalen = sizeof(struct tc_u32_key), + .match = em_u32_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_u32_ops.link) +}; + +static int __init init_em_u32(void) +{ + return tcf_em_register(&em_u32_ops); +} + +static void __exit exit_em_u32(void) +{ + tcf_em_unregister(&em_u32_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_u32); +module_exit(exit_em_u32); diff --git a/net/sched/ematch.c b/net/sched/ematch.c new file mode 100644 index 00000000000..ebfe2e7d21b --- /dev/null +++ b/net/sched/ematch.c @@ -0,0 +1,524 @@ +/* + * net/sched/ematch.c Extended Match API + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * + * ========================================================================== + * + * An extended match (ematch) is a small classification tool not worth + * writing a full classifier for. Ematches can be interconnected to form + * a logic expression and get attached to classifiers to extend their + * functionatlity. + * + * The userspace part transforms the logic expressions into an array + * consisting of multiple sequences of interconnected ematches separated + * by markers. Precedence is implemented by a special ematch kind + * referencing a sequence beyond the marker of the current sequence + * causing the current position in the sequence to be pushed onto a stack + * to allow the current position to be overwritten by the position referenced + * in the special ematch. Matching continues in the new sequence until a + * marker is reached causing the position to be restored from the stack. + * + * Example: + * A AND (B1 OR B2) AND C AND D + * + * ------->-PUSH------- + * -->-- / -->-- \ -->-- + * / \ / / \ \ / \ + * +-------+-------+-------+-------+-------+--------+ + * | A AND | B AND | C AND | D END | B1 OR | B2 END | + * +-------+-------+-------+-------+-------+--------+ + * \ / + * --------<-POP--------- + * + * where B is a virtual ematch referencing to sequence starting with B1. + * + * ========================================================================== + * + * How to write an ematch in 60 seconds + * ------------------------------------ + * + * 1) Provide a matcher function: + * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, + * struct tcf_pkt_info *info) + * { + * struct mydata *d = (struct mydata *) m->data; + * + * if (...matching goes here...) + * return 1; + * else + * return 0; + * } + * + * 2) Fill out a struct tcf_ematch_ops: + * static struct tcf_ematch_ops my_ops = { + * .kind = unique id, + * .datalen = sizeof(struct mydata), + * .match = my_match, + * .owner = THIS_MODULE, + * }; + * + * 3) Register/Unregister your ematch: + * static int __init init_my_ematch(void) + * { + * return tcf_em_register(&my_ops); + * } + * + * static void __exit exit_my_ematch(void) + * { + * return tcf_em_unregister(&my_ops); + * } + * + * module_init(init_my_ematch); + * module_exit(exit_my_ematch); + * + * 4) By now you should have two more seconds left, barely enough to + * open up a beer to watch the compilation going. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <net/pkt_cls.h> +#include <config/net/ematch/stack.h> + +static LIST_HEAD(ematch_ops); +static DEFINE_RWLOCK(ematch_mod_lock); + +static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) +{ + struct tcf_ematch_ops *e = NULL; + + read_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) { + if (kind == e->kind) { + if (!try_module_get(e->owner)) + e = NULL; + read_unlock(&ematch_mod_lock); + return e; + } + } + read_unlock(&ematch_mod_lock); + + return NULL; +} + +/** + * tcf_em_register - register an extended match + * + * @ops: ematch operations lookup table + * + * This function must be called by ematches to announce their presence. + * The given @ops must have kind set to a unique identifier and the + * callback match() must be implemented. All other callbacks are optional + * and a fallback implementation is used instead. + * + * Returns -EEXISTS if an ematch of the same kind has already registered. + */ +int tcf_em_register(struct tcf_ematch_ops *ops) +{ + int err = -EEXIST; + struct tcf_ematch_ops *e; + + if (ops->match == NULL) + return -EINVAL; + + write_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) + if (ops->kind == e->kind) + goto errout; + + list_add_tail(&ops->link, &ematch_ops); + err = 0; +errout: + write_unlock(&ematch_mod_lock); + return err; +} + +/** + * tcf_em_unregister - unregster and extended match + * + * @ops: ematch operations lookup table + * + * This function must be called by ematches to announce their disappearance + * for examples when the module gets unloaded. The @ops parameter must be + * the same as the one used for registration. + * + * Returns -ENOENT if no matching ematch was found. + */ +int tcf_em_unregister(struct tcf_ematch_ops *ops) +{ + int err = 0; + struct tcf_ematch_ops *e; + + write_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) { + if (e == ops) { + list_del(&e->link); + goto out; + } + } + + err = -ENOENT; +out: + write_unlock(&ematch_mod_lock); + return err; +} + +static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, + int index) +{ + return &tree->matches[index]; +} + + +static int tcf_em_validate(struct tcf_proto *tp, + struct tcf_ematch_tree_hdr *tree_hdr, + struct tcf_ematch *em, struct rtattr *rta, int idx) +{ + int err = -EINVAL; + struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta); + int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr); + void *data = (void *) em_hdr + sizeof(*em_hdr); + + if (!TCF_EM_REL_VALID(em_hdr->flags)) + goto errout; + + if (em_hdr->kind == TCF_EM_CONTAINER) { + /* Special ematch called "container", carries an index + * referencing an external ematch sequence. */ + u32 ref; + + if (data_len < sizeof(ref)) + goto errout; + ref = *(u32 *) data; + + if (ref >= tree_hdr->nmatches) + goto errout; + + /* We do not allow backward jumps to avoid loops and jumps + * to our own position are of course illegal. */ + if (ref <= idx) + goto errout; + + + em->data = ref; + } else { + /* Note: This lookup will increase the module refcnt + * of the ematch module referenced. In case of a failure, + * a destroy function is called by the underlying layer + * which automatically releases the reference again, therefore + * the module MUST not be given back under any circumstances + * here. Be aware, the destroy function assumes that the + * module is held if the ops field is non zero. */ + em->ops = tcf_em_lookup(em_hdr->kind); + + if (em->ops == NULL) { + err = -ENOENT; + goto errout; + } + + /* ematch module provides expected length of data, so we + * can do a basic sanity check. */ + if (em->ops->datalen && data_len < em->ops->datalen) + goto errout; + + if (em->ops->change) { + err = em->ops->change(tp, data, data_len, em); + if (err < 0) + goto errout; + } else if (data_len > 0) { + /* ematch module doesn't provide an own change + * procedure and expects us to allocate and copy + * the ematch data. + * + * TCF_EM_SIMPLE may be specified stating that the + * data only consists of a u32 integer and the module + * does not expected a memory reference but rather + * the value carried. */ + if (em_hdr->flags & TCF_EM_SIMPLE) { + if (data_len < sizeof(u32)) + goto errout; + em->data = *(u32 *) data; + } else { + void *v = kmalloc(data_len, GFP_KERNEL); + if (v == NULL) { + err = -ENOBUFS; + goto errout; + } + memcpy(v, data, data_len); + em->data = (unsigned long) v; + } + } + } + + em->matchid = em_hdr->matchid; + em->flags = em_hdr->flags; + em->datalen = data_len; + + err = 0; +errout: + return err; +} + +/** + * tcf_em_tree_validate - validate ematch config TLV and build ematch tree + * + * @tp: classifier kind handle + * @rta: ematch tree configuration TLV + * @tree: destination ematch tree variable to store the resulting + * ematch tree. + * + * This function validates the given configuration TLV @rta and builds an + * ematch tree in @tree. The resulting tree must later be copied into + * the private classifier data using tcf_em_tree_change(). You MUST NOT + * provide the ematch tree variable of the private classifier data directly, + * the changes would not be locked properly. + * + * Returns a negative error code if the configuration TLV contains errors. + */ +int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, + struct tcf_ematch_tree *tree) +{ + int idx, list_len, matches_len, err = -EINVAL; + struct rtattr *tb[TCA_EMATCH_TREE_MAX]; + struct rtattr *rt_match, *rt_hdr, *rt_list; + struct tcf_ematch_tree_hdr *tree_hdr; + struct tcf_ematch *em; + + if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) + goto errout; + + rt_hdr = tb[TCA_EMATCH_TREE_HDR-1]; + rt_list = tb[TCA_EMATCH_TREE_LIST-1]; + + if (rt_hdr == NULL || rt_list == NULL) + goto errout; + + if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) || + RTA_PAYLOAD(rt_list) < sizeof(*rt_match)) + goto errout; + + tree_hdr = RTA_DATA(rt_hdr); + memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); + + rt_match = RTA_DATA(rt_list); + list_len = RTA_PAYLOAD(rt_list); + matches_len = tree_hdr->nmatches * sizeof(*em); + + tree->matches = kmalloc(matches_len, GFP_KERNEL); + if (tree->matches == NULL) + goto errout; + memset(tree->matches, 0, matches_len); + + /* We do not use rtattr_parse_nested here because the maximum + * number of attributes is unknown. This saves us the allocation + * for a tb buffer which would serve no purpose at all. + * + * The array of rt attributes is parsed in the order as they are + * provided, their type must be incremental from 1 to n. Even + * if it does not serve any real purpose, a failure of sticking + * to this policy will result in parsing failure. */ + for (idx = 0; RTA_OK(rt_match, list_len); idx++) { + err = -EINVAL; + + if (rt_match->rta_type != (idx + 1)) + goto errout_abort; + + if (idx >= tree_hdr->nmatches) + goto errout_abort; + + if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr)) + goto errout_abort; + + em = tcf_em_get_match(tree, idx); + + err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); + if (err < 0) + goto errout_abort; + + rt_match = RTA_NEXT(rt_match, list_len); + } + + /* Check if the number of matches provided by userspace actually + * complies with the array of matches. The number was used for + * the validation of references and a mismatch could lead to + * undefined references during the matching process. */ + if (idx != tree_hdr->nmatches) { + err = -EINVAL; + goto errout_abort; + } + + err = 0; +errout: + return err; + +errout_abort: + tcf_em_tree_destroy(tp, tree); + return err; +} + +/** + * tcf_em_tree_destroy - destroy an ematch tree + * + * @tp: classifier kind handle + * @tree: ematch tree to be deleted + * + * This functions destroys an ematch tree previously created by + * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that + * the ematch tree is not in use before calling this function. + */ +void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) +{ + int i; + + if (tree->matches == NULL) + return; + + for (i = 0; i < tree->hdr.nmatches; i++) { + struct tcf_ematch *em = tcf_em_get_match(tree, i); + + if (em->ops) { + if (em->ops->destroy) + em->ops->destroy(tp, em); + else if (!tcf_em_is_simple(em) && em->data) + kfree((void *) em->data); + module_put(em->ops->owner); + } + } + + tree->hdr.nmatches = 0; + kfree(tree->matches); +} + +/** + * tcf_em_tree_dump - dump ematch tree into a rtnl message + * + * @skb: skb holding the rtnl message + * @t: ematch tree to be dumped + * @tlv: TLV type to be used to encapsulate the tree + * + * This function dumps a ematch tree into a rtnl message. It is valid to + * call this function while the ematch tree is in use. + * + * Returns -1 if the skb tailroom is insufficient. + */ +int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) +{ + int i; + struct rtattr * top_start = (struct rtattr*) skb->tail; + struct rtattr * list_start; + + RTA_PUT(skb, tlv, 0, NULL); + RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + + list_start = (struct rtattr *) skb->tail; + RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); + + for (i = 0; i < tree->hdr.nmatches; i++) { + struct rtattr *match_start = (struct rtattr*) skb->tail; + struct tcf_ematch *em = tcf_em_get_match(tree, i); + struct tcf_ematch_hdr em_hdr = { + .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, + .matchid = em->matchid, + .flags = em->flags + }; + + RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); + + if (em->ops && em->ops->dump) { + if (em->ops->dump(skb, em) < 0) + goto rtattr_failure; + } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { + u32 u = em->data; + RTA_PUT_NOHDR(skb, sizeof(u), &u); + } else if (em->datalen > 0) + RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); + + match_start->rta_len = skb->tail - (u8*) match_start; + } + + list_start->rta_len = skb->tail - (u8 *) list_start; + top_start->rta_len = skb->tail - (u8 *) top_start; + + return 0; + +rtattr_failure: + return -1; +} + +static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + int r = em->ops->match(skb, em, info); + return tcf_em_is_inverted(em) ? !r : r; +} + +/* Do not use this function directly, use tcf_em_tree_match instead */ +int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, + struct tcf_pkt_info *info) +{ + int stackp = 0, match_idx = 0, res = 0; + struct tcf_ematch *cur_match; + int stack[CONFIG_NET_EMATCH_STACK]; + +proceed: + while (match_idx < tree->hdr.nmatches) { + cur_match = tcf_em_get_match(tree, match_idx); + + if (tcf_em_is_container(cur_match)) { + if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) + goto stack_overflow; + + stack[stackp++] = match_idx; + match_idx = cur_match->data; + goto proceed; + } + + res = tcf_em_match(skb, cur_match, info); + + if (tcf_em_early_end(cur_match, res)) + break; + + match_idx++; + } + +pop_stack: + if (stackp > 0) { + match_idx = stack[--stackp]; + cur_match = tcf_em_get_match(tree, match_idx); + + if (tcf_em_early_end(cur_match, res)) + goto pop_stack; + else { + match_idx++; + goto proceed; + } + } + + return res; + +stack_overflow: + if (net_ratelimit()) + printk("Local stack overflow, increase NET_EMATCH_STACK\n"); + return -1; +} + +EXPORT_SYMBOL(tcf_em_register); +EXPORT_SYMBOL(tcf_em_unregister); +EXPORT_SYMBOL(tcf_em_tree_validate); +EXPORT_SYMBOL(tcf_em_tree_destroy); +EXPORT_SYMBOL(tcf_em_tree_dump); +EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/estimator.c b/net/sched/estimator.c new file mode 100644 index 00000000000..5d3ae03e22a --- /dev/null +++ b/net/sched/estimator.c @@ -0,0 +1,197 @@ +/* + * net/sched/estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to built an estimator into the scheduler, + but run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<<interval) seconds and evaluate EWMA: + + avrate = avrate*(1-W) + rate*W + + where W is chosen as negative power of 2: W = 2^(-ewma_log) + + The resulting time constant is: + + T = A/(-ln(1-W)) + + + NOTES. + + * The stored value for avbps is scaled by 2^5, so that maximal + rate is ~1Gbit, avpps is scaled by 2^10. + + * Minimal interval is HZ/4=250msec (it is the greatest common divisor + for HZ=100 and HZ=1024 8)), maximal interval + is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals + are too expensive, longer ones can be implemented + at user level painlessly. + */ + +#define EST_MAX_INTERVAL 5 + +struct qdisc_estimator +{ + struct qdisc_estimator *next; + struct tc_stats *stats; + spinlock_t *stats_lock; + unsigned interval; + int ewma_log; + u64 last_bytes; + u32 last_packets; + u32 avpps; + u32 avbps; +}; + +struct qdisc_estimator_head +{ + struct timer_list timer; + struct qdisc_estimator *list; +}; + +static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; + +/* Estimator array lock */ +static DEFINE_RWLOCK(est_lock); + +static void est_timer(unsigned long arg) +{ + int idx = (int)arg; + struct qdisc_estimator *e; + + read_lock(&est_lock); + for (e = elist[idx].list; e; e = e->next) { + struct tc_stats *st = e->stats; + u64 nbytes; + u32 npackets; + u32 rate; + + spin_lock(e->stats_lock); + nbytes = st->bytes; + npackets = st->packets; + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + st->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->stats->pps = (e->avpps+0x1FF)>>10; + spin_unlock(e->stats_lock); + } + + mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4)); + read_unlock(&est_lock); +} + +int qdisc_new_estimator(struct tc_stats *stats, spinlock_t *stats_lock, struct rtattr *opt) +{ + struct qdisc_estimator *est; + struct tc_estimator *parm = RTA_DATA(opt); + + if (RTA_PAYLOAD(opt) < sizeof(*parm)) + return -EINVAL; + + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->stats = stats; + est->stats_lock = stats_lock; + est->ewma_log = parm->ewma_log; + est->last_bytes = stats->bytes; + est->avbps = stats->bps<<5; + est->last_packets = stats->packets; + est->avpps = stats->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + write_lock_bh(&est_lock); + elist[est->interval].list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void qdisc_kill_estimator(struct tc_stats *stats) +{ + int idx; + struct qdisc_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + + write_lock_bh(&est_lock); + *pest = est->next; + write_unlock_bh(&est_lock); + + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + +EXPORT_SYMBOL(qdisc_kill_estimator); +EXPORT_SYMBOL(qdisc_new_estimator); diff --git a/net/sched/gact.c b/net/sched/gact.c new file mode 100644 index 00000000000..a811c89fef7 --- /dev/null +++ b/net/sched/gact.c @@ -0,0 +1,231 @@ +/* + * net/sched/gact.c Generic actions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * copyright Jamal Hadi Salim (2002-4) + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/tc_act/tc_gact.h> +#include <net/tc_act/tc_gact.h> + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 + +static u32 idx_gen; +static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE]; +static DEFINE_RWLOCK(gact_lock); + +/* ovewrride the defaults */ +#define tcf_st tcf_gact +#define tc_st tc_gact +#define tcf_t_lock gact_lock +#define tcf_ht tcf_gact_ht + +#define CONFIG_NET_ACT_INIT 1 +#include <net/pkt_act.h> + +#ifdef CONFIG_GACT_PROB +static int gact_net_rand(struct tcf_gact *p) +{ + if (net_random()%p->pval) + return p->action; + return p->paction; +} + +static int gact_determ(struct tcf_gact *p) +{ + if (p->bstats.packets%p->pval) + return p->action; + return p->paction; +} + +typedef int (*g_rand)(struct tcf_gact *p); +static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; +#endif + +static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct rtattr *tb[TCA_GACT_MAX]; + struct tc_gact *parm; + struct tcf_gact *p; + int ret = 0; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_GACT_PARMS - 1] == NULL || + RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]); + + if (tb[TCA_GACT_PROB-1] != NULL) +#ifdef CONFIG_GACT_PROB + if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p)) + return -EINVAL; +#else + return -EOPNOTSUPP; +#endif + + p = tcf_hash_check(parm->index, a, ovr, bind); + if (p == NULL) { + p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_hash_release(p, bind); + return -EEXIST; + } + } + + spin_lock_bh(&p->lock); + p->action = parm->action; +#ifdef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB-1] != NULL) { + struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]); + p->paction = p_parm->paction; + p->pval = p_parm->pval; + p->ptype = p_parm->ptype; + } +#endif + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + return ret; +} + +static int +tcf_gact_cleanup(struct tc_action *a, int bind) +{ + struct tcf_gact *p = PRIV(a, gact); + + if (p != NULL) + return tcf_hash_release(p, bind); + return 0; +} + +static int +tcf_gact(struct sk_buff **pskb, struct tc_action *a) +{ + struct tcf_gact *p = PRIV(a, gact); + struct sk_buff *skb = *pskb; + int action = TC_ACT_SHOT; + + spin_lock(&p->lock); +#ifdef CONFIG_GACT_PROB + if (p->ptype && gact_rand[p->ptype] != NULL) + action = gact_rand[p->ptype](p); + else + action = p->action; +#else + action = p->action; +#endif + p->bstats.bytes += skb->len; + p->bstats.packets++; + if (action == TC_ACT_SHOT) + p->qstats.drops++; + p->tm.lastuse = jiffies; + spin_unlock(&p->lock); + + return action; +} + +static int +tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_gact opt; + struct tcf_gact *p = PRIV(a, gact); + struct tcf_t t; + + opt.index = p->index; + opt.refcnt = p->refcnt - ref; + opt.bindcnt = p->bindcnt - bind; + opt.action = p->action; + RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); +#ifdef CONFIG_GACT_PROB + if (p->ptype) { + struct tc_gact_p p_opt; + p_opt.paction = p->paction; + p_opt.pval = p->pval; + p_opt.ptype = p->ptype; + RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + } +#endif + t.install = jiffies_to_clock_t(jiffies - p->tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + t.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tc_action_ops act_gact_ops = { + .kind = "gact", + .type = TCA_ACT_GACT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_gact, + .dump = tcf_gact_dump, + .cleanup = tcf_gact_cleanup, + .lookup = tcf_hash_search, + .init = tcf_gact_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Generic Classifier actions"); +MODULE_LICENSE("GPL"); + +static int __init +gact_init_module(void) +{ +#ifdef CONFIG_GACT_PROB + printk("GACT probability on\n"); +#else + printk("GACT probability NOT on\n"); +#endif + return tcf_register_action(&act_gact_ops); +} + +static void __exit +gact_cleanup_module(void) +{ + tcf_unregister_action(&act_gact_ops); +} + +module_init(gact_init_module); +module_exit(gact_cleanup_module); diff --git a/net/sched/ipt.c b/net/sched/ipt.c new file mode 100644 index 00000000000..b114d994d52 --- /dev/null +++ b/net/sched/ipt.c @@ -0,0 +1,326 @@ +/* + * net/sched/ipt.c iptables target interface + * + *TODO: Add other tables. For now we only support the ipv4 table targets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright: Jamal Hadi Salim (2002-4) + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/kmod.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/tc_act/tc_ipt.h> +#include <net/tc_act/tc_ipt.h> + +#include <linux/netfilter_ipv4/ip_tables.h> + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 + +static u32 idx_gen; +static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE]; +/* ipt hash table lock */ +static DEFINE_RWLOCK(ipt_lock); + +/* ovewrride the defaults */ +#define tcf_st tcf_ipt +#define tcf_t_lock ipt_lock +#define tcf_ht tcf_ipt_ht + +#define CONFIG_NET_ACT_INIT +#include <net/pkt_act.h> + +static int +ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) +{ + struct ipt_target *target; + int ret = 0; + + target = ipt_find_target(t->u.user.name, t->u.user.revision); + if (!target) + return -ENOENT; + + DPRINTK("ipt_init_target: found %s\n", target->name); + t->u.kernel.target = target; + + if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(table, NULL, t->data, + t->u.target_size - sizeof(*t), + hook)) { + DPRINTK("ipt_init_target: check failed for `%s'.\n", + t->u.kernel.target->name); + module_put(t->u.kernel.target->me); + ret = -EINVAL; + } + + return ret; +} + +static void +ipt_destroy_target(struct ipt_entry_target *t) +{ + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); +} + +static int +tcf_ipt_release(struct tcf_ipt *p, int bind) +{ + int ret = 0; + if (p) { + if (bind) + p->bindcnt--; + p->refcnt--; + if (p->bindcnt <= 0 && p->refcnt <= 0) { + ipt_destroy_target(p->t); + kfree(p->tname); + kfree(p->t); + tcf_hash_destroy(p); + ret = ACT_P_DELETED; + } + } + return ret; +} + +static int +tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct rtattr *tb[TCA_IPT_MAX]; + struct tcf_ipt *p; + struct ipt_entry_target *td, *t; + char *tname; + int ret = 0, err; + u32 hook = 0; + u32 index = 0; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_IPT_HOOK-1] == NULL || + RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32)) + return -EINVAL; + if (tb[TCA_IPT_TARG-1] == NULL || + RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t)) + return -EINVAL; + td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]); + if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size) + return -EINVAL; + + if (tb[TCA_IPT_INDEX-1] != NULL && + RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32)) + index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]); + + p = tcf_hash_check(index, a, ovr, bind); + if (p == NULL) { + p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_ipt_release(p, bind); + return -EEXIST; + } + } + + hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]); + + err = -ENOMEM; + tname = kmalloc(IFNAMSIZ, GFP_KERNEL); + if (tname == NULL) + goto err1; + if (tb[TCA_IPT_TABLE - 1] == NULL || + rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ) + strcpy(tname, "mangle"); + + t = kmalloc(td->u.target_size, GFP_KERNEL); + if (t == NULL) + goto err2; + memcpy(t, td, td->u.target_size); + + if ((err = ipt_init_target(t, tname, hook)) < 0) + goto err3; + + spin_lock_bh(&p->lock); + if (ret != ACT_P_CREATED) { + ipt_destroy_target(p->t); + kfree(p->tname); + kfree(p->t); + } + p->tname = tname; + p->t = t; + p->hook = hook; + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + return ret; + +err3: + kfree(t); +err2: + kfree(tname); +err1: + kfree(p); + return err; +} + +static int +tcf_ipt_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ipt *p = PRIV(a, ipt); + return tcf_ipt_release(p, bind); +} + +static int +tcf_ipt(struct sk_buff **pskb, struct tc_action *a) +{ + int ret = 0, result = 0; + struct tcf_ipt *p = PRIV(a, ipt); + struct sk_buff *skb = *pskb; + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return TC_ACT_UNSPEC; + } + + spin_lock(&p->lock); + + p->tm.lastuse = jiffies; + p->bstats.bytes += skb->len; + p->bstats.packets++; + + /* yes, we have to worry about both in and out dev + worry later - danger - this API seems to have changed + from earlier kernels */ + + ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, + p->hook, p->t->data, NULL); + switch (ret) { + case NF_ACCEPT: + result = TC_ACT_OK; + break; + case NF_DROP: + result = TC_ACT_SHOT; + p->qstats.drops++; + break; + case IPT_CONTINUE: + result = TC_ACT_PIPE; + break; + default: + if (net_ratelimit()) + printk("Bogus netfilter code %d assume ACCEPT\n", ret); + result = TC_POLICE_OK; + break; + } + spin_unlock(&p->lock); + return result; + +} + +static int +tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + struct ipt_entry_target *t; + struct tcf_t tm; + struct tc_cnt c; + unsigned char *b = skb->tail; + struct tcf_ipt *p = PRIV(a, ipt); + + /* for simple targets kernel size == user size + ** user name = target name + ** for foolproof you need to not assume this + */ + + t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC); + if (t == NULL) + goto rtattr_failure; + + c.bindcnt = p->bindcnt - bind; + c.refcnt = p->refcnt - ref; + memcpy(t, p->t, p->t->u.user.target_size); + strcpy(t->u.user.name, p->t->u.kernel.target->name); + + DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname, + strlen(p->tname)); + DPRINTK("\tdump target name %s size %d size user %d " + "data[0] %x data[1] %x\n", p->t->u.kernel.target->name, + p->t->u.target_size, p->t->u.user.target_size, + p->t->data[0], p->t->data[1]); + RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t); + RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index); + RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook); + RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); + RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, p->tname); + tm.install = jiffies_to_clock_t(jiffies - p->tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + tm.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + kfree(t); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + kfree(t); + return -1; +} + +static struct tc_action_ops act_ipt_ops = { + .kind = "ipt", + .type = TCA_ACT_IPT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_cleanup, + .lookup = tcf_hash_search, + .init = tcf_ipt_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Iptables target actions"); +MODULE_LICENSE("GPL"); + +static int __init +ipt_init_module(void) +{ + return tcf_register_action(&act_ipt_ops); +} + +static void __exit +ipt_cleanup_module(void) +{ + tcf_unregister_action(&act_ipt_ops); +} + +module_init(ipt_init_module); +module_exit(ipt_cleanup_module); diff --git a/net/sched/mirred.c b/net/sched/mirred.c new file mode 100644 index 00000000000..f309ce33680 --- /dev/null +++ b/net/sched/mirred.c @@ -0,0 +1,276 @@ +/* + * net/sched/mirred.c packet mirroring and redirect actions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim (2002-4) + * + * TODO: Add ingress support (and socket redirect support) + * + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/tc_act/tc_mirred.h> +#include <net/tc_act/tc_mirred.h> + +#include <linux/etherdevice.h> +#include <linux/if_arp.h> + + +/* use generic hash table */ +#define MY_TAB_SIZE 8 +#define MY_TAB_MASK (MY_TAB_SIZE - 1) +static u32 idx_gen; +static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE]; +static DEFINE_RWLOCK(mirred_lock); + +/* ovewrride the defaults */ +#define tcf_st tcf_mirred +#define tc_st tc_mirred +#define tcf_t_lock mirred_lock +#define tcf_ht tcf_mirred_ht + +#define CONFIG_NET_ACT_INIT 1 +#include <net/pkt_act.h> + +static inline int +tcf_mirred_release(struct tcf_mirred *p, int bind) +{ + if (p) { + if (bind) + p->bindcnt--; + p->refcnt--; + if(!p->bindcnt && p->refcnt <= 0) { + dev_put(p->dev); + tcf_hash_destroy(p); + return 1; + } + } + return 0; +} + +static int +tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct rtattr *tb[TCA_MIRRED_MAX]; + struct tc_mirred *parm; + struct tcf_mirred *p; + struct net_device *dev = NULL; + int ret = 0; + int ok_push = 0; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_MIRRED_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]); + + if (parm->ifindex) { + dev = __dev_get_by_index(parm->ifindex); + if (dev == NULL) + return -ENODEV; + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + ok_push = 0; + break; + default: + ok_push = 1; + break; + } + } + + p = tcf_hash_check(parm->index, a, ovr, bind); + if (p == NULL) { + if (!parm->ifindex) + return -EINVAL; + p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_mirred_release(p, bind); + return -EEXIST; + } + } + + spin_lock_bh(&p->lock); + p->action = parm->action; + p->eaction = parm->eaction; + if (parm->ifindex) { + p->ifindex = parm->ifindex; + if (ret != ACT_P_CREATED) + dev_put(p->dev); + p->dev = dev; + dev_hold(dev); + p->ok_push = ok_push; + } + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + + DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s " + "ifindex %d\n", parm->index, parm->action, parm->eaction, + dev->name, parm->ifindex); + return ret; +} + +static int +tcf_mirred_cleanup(struct tc_action *a, int bind) +{ + struct tcf_mirred *p = PRIV(a, mirred); + + if (p != NULL) + return tcf_mirred_release(p, bind); + return 0; +} + +static int +tcf_mirred(struct sk_buff **pskb, struct tc_action *a) +{ + struct tcf_mirred *p = PRIV(a, mirred); + struct net_device *dev; + struct sk_buff *skb2 = NULL; + struct sk_buff *skb = *pskb; + u32 at = G_TC_AT(skb->tc_verd); + + spin_lock(&p->lock); + + dev = p->dev; + p->tm.lastuse = jiffies; + + if (!(dev->flags&IFF_UP) ) { + if (net_ratelimit()) + printk("mirred to Houston: device %s is gone!\n", + dev->name); +bad_mirred: + if (skb2 != NULL) + kfree_skb(skb2); + p->qstats.overlimits++; + p->bstats.bytes += skb->len; + p->bstats.packets++; + spin_unlock(&p->lock); + /* should we be asking for packet to be dropped? + * may make sense for redirect case only + */ + return TC_ACT_SHOT; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + goto bad_mirred; + if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) { + if (net_ratelimit()) + printk("tcf_mirred unknown action %d\n", p->eaction); + goto bad_mirred; + } + + p->bstats.bytes += skb2->len; + p->bstats.packets++; + if (!(at & AT_EGRESS)) + if (p->ok_push) + skb_push(skb2, skb2->dev->hard_header_len); + + /* mirror is always swallowed */ + if (p->eaction != TCA_EGRESS_MIRROR) + skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + + skb2->dev = dev; + skb2->input_dev = skb->dev; + dev_queue_xmit(skb2); + spin_unlock(&p->lock); + return p->action; +} + +static int +tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_mirred opt; + struct tcf_mirred *p = PRIV(a, mirred); + struct tcf_t t; + + opt.index = p->index; + opt.action = p->action; + opt.refcnt = p->refcnt - ref; + opt.bindcnt = p->bindcnt - bind; + opt.eaction = p->eaction; + opt.ifindex = p->ifindex; + DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n", + p->index, p->action, p->eaction, p->ifindex); + RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + t.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tc_action_ops act_mirred_ops = { + .kind = "mirred", + .type = TCA_ACT_MIRRED, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_mirred, + .dump = tcf_mirred_dump, + .cleanup = tcf_mirred_cleanup, + .lookup = tcf_hash_search, + .init = tcf_mirred_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002)"); +MODULE_DESCRIPTION("Device Mirror/redirect actions"); +MODULE_LICENSE("GPL"); + +static int __init +mirred_init_module(void) +{ + printk("Mirror/redirect action on\n"); + return tcf_register_action(&act_mirred_ops); +} + +static void __exit +mirred_cleanup_module(void) +{ + tcf_unregister_action(&act_mirred_ops); +} + +module_init(mirred_init_module); +module_exit(mirred_cleanup_module); diff --git a/net/sched/pedit.c b/net/sched/pedit.c new file mode 100644 index 00000000000..678be6a645f --- /dev/null +++ b/net/sched/pedit.c @@ -0,0 +1,288 @@ +/* + * net/sched/pedit.c Generic packet editor + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim (2002-4) + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/tc_act/tc_pedit.h> +#include <net/tc_act/tc_pedit.h> + + +#define PEDIT_DEB 1 + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 +static u32 idx_gen; +static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE]; +static DEFINE_RWLOCK(pedit_lock); + +#define tcf_st tcf_pedit +#define tc_st tc_pedit +#define tcf_t_lock pedit_lock +#define tcf_ht tcf_pedit_ht + +#define CONFIG_NET_ACT_INIT 1 +#include <net/pkt_act.h> + +static int +tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct rtattr *tb[TCA_PEDIT_MAX]; + struct tc_pedit *parm; + int ret = 0; + struct tcf_pedit *p; + struct tc_pedit_key *keys = NULL; + int ksize; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_PEDIT_PARMS - 1] == NULL || + RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]); + ksize = parm->nkeys * sizeof(struct tc_pedit_key); + if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize) + return -EINVAL; + + p = tcf_hash_check(parm->index, a, ovr, bind); + if (p == NULL) { + if (!parm->nkeys) + return -EINVAL; + p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + keys = kmalloc(ksize, GFP_KERNEL); + if (keys == NULL) { + kfree(p); + return -ENOMEM; + } + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_hash_release(p, bind); + return -EEXIST; + } + if (p->nkeys && p->nkeys != parm->nkeys) { + keys = kmalloc(ksize, GFP_KERNEL); + if (keys == NULL) + return -ENOMEM; + } + } + + spin_lock_bh(&p->lock); + p->flags = parm->flags; + p->action = parm->action; + if (keys) { + kfree(p->keys); + p->keys = keys; + p->nkeys = parm->nkeys; + } + memcpy(p->keys, parm->keys, ksize); + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + return ret; +} + +static int +tcf_pedit_cleanup(struct tc_action *a, int bind) +{ + struct tcf_pedit *p = PRIV(a, pedit); + + if (p != NULL) { + struct tc_pedit_key *keys = p->keys; + if (tcf_hash_release(p, bind)) { + kfree(keys); + return 1; + } + } + return 0; +} + +static int +tcf_pedit(struct sk_buff **pskb, struct tc_action *a) +{ + struct tcf_pedit *p = PRIV(a, pedit); + struct sk_buff *skb = *pskb; + int i, munged = 0; + u8 *pptr; + + if (!(skb->tc_verd & TC_OK2MUNGE)) { + /* should we set skb->cloned? */ + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { + return p->action; + } + } + + pptr = skb->nh.raw; + + spin_lock(&p->lock); + + p->tm.lastuse = jiffies; + + if (p->nkeys > 0) { + struct tc_pedit_key *tkey = p->keys; + + for (i = p->nkeys; i > 0; i--, tkey++) { + u32 *ptr; + int offset = tkey->off; + + if (tkey->offmask) { + if (skb->len > tkey->at) { + char *j = pptr + tkey->at; + offset += ((*j & tkey->offmask) >> + tkey->shift); + } else { + goto bad; + } + } + + if (offset % 4) { + printk("offset must be on 32 bit boundaries\n"); + goto bad; + } + if (skb->len < 0 || (offset > 0 && offset > skb->len)) { + printk("offset %d cant exceed pkt length %d\n", + offset, skb->len); + goto bad; + } + + ptr = (u32 *)(pptr+offset); + /* just do it, baby */ + *ptr = ((*ptr & tkey->mask) ^ tkey->val); + munged++; + } + + if (munged) + skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); + goto done; + } else { + printk("pedit BUG: index %d\n",p->index); + } + +bad: + p->qstats.overlimits++; +done: + p->bstats.bytes += skb->len; + p->bstats.packets++; + spin_unlock(&p->lock); + return p->action; +} + +static int +tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_pedit *opt; + struct tcf_pedit *p = PRIV(a, pedit); + struct tcf_t t; + int s; + + s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key); + + /* netlink spinlocks held above us - must use ATOMIC */ + opt = kmalloc(s, GFP_ATOMIC); + if (opt == NULL) + return -ENOBUFS; + memset(opt, 0, s); + + memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key)); + opt->index = p->index; + opt->nkeys = p->nkeys; + opt->flags = p->flags; + opt->action = p->action; + opt->refcnt = p->refcnt - ref; + opt->bindcnt = p->bindcnt - bind; + + +#ifdef PEDIT_DEB + { + /* Debug - get rid of later */ + int i; + struct tc_pedit_key *key = opt->keys; + + for (i=0; i<opt->nkeys; i++, key++) { + printk( "\n key #%d",i); + printk( " at %d: val %08x mask %08x", + (unsigned int)key->off, + (unsigned int)key->val, + (unsigned int)key->mask); + } + } +#endif + + RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt); + t.install = jiffies_to_clock_t(jiffies - p->tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + t.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static +struct tc_action_ops act_pedit_ops = { + .kind = "pedit", + .type = TCA_ACT_PEDIT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_pedit, + .dump = tcf_pedit_dump, + .cleanup = tcf_pedit_cleanup, + .lookup = tcf_hash_search, + .init = tcf_pedit_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Generic Packet Editor actions"); +MODULE_LICENSE("GPL"); + +static int __init +pedit_init_module(void) +{ + return tcf_register_action(&act_pedit_ops); +} + +static void __exit +pedit_cleanup_module(void) +{ + tcf_unregister_action(&act_pedit_ops); +} + +module_init(pedit_init_module); +module_exit(pedit_cleanup_module); + diff --git a/net/sched/police.c b/net/sched/police.c new file mode 100644 index 00000000000..c03545faf52 --- /dev/null +++ b/net/sched/police.c @@ -0,0 +1,612 @@ +/* + * net/sched/police.c Input police filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * J Hadi Salim (action changes) + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/module.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/act_api.h> + +#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) +#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) +#define PRIV(a) ((struct tcf_police *) (a)->priv) + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 +static u32 idx_gen; +static struct tcf_police *tcf_police_ht[MY_TAB_SIZE]; +/* Policer hash table lock */ +static DEFINE_RWLOCK(police_lock); + +/* Each policer is serialized by its individual spinlock */ + +static __inline__ unsigned tcf_police_hash(u32 index) +{ + return index&0xF; +} + +static __inline__ struct tcf_police * tcf_police_lookup(u32 index) +{ + struct tcf_police *p; + + read_lock(&police_lock); + for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { + if (p->index == index) + break; + } + read_unlock(&police_lock); + return p; +} + +#ifdef CONFIG_NET_CLS_ACT +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) +{ + struct tcf_police *p; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + struct rtattr *r; + + read_lock(&police_lock); + + s_i = cb->args[0]; + + for (i = 0; i < MY_TAB_SIZE; i++) { + p = tcf_police_ht[tcf_police_hash(i)]; + + for (; p; p = p->next) { + index++; + if (index < s_i) + continue; + a->priv = p; + a->order = index; + r = (struct rtattr*) skb->tail; + RTA_PUT(skb, a->order, 0, NULL); + if (type == RTM_DELACTION) + err = tcf_action_dump_1(skb, a, 0, 1); + else + err = tcf_action_dump_1(skb, a, 0, 0); + if (err < 0) { + index--; + skb_trim(skb, (u8*)r - skb->data); + goto done; + } + r->rta_len = skb->tail - (u8*)r; + n_i++; + } + } +done: + read_unlock(&police_lock); + if (n_i) + cb->args[0] += n_i; + return n_i; + +rtattr_failure: + skb_trim(skb, (u8*)r - skb->data); + goto done; +} + +static inline int +tcf_hash_search(struct tc_action *a, u32 index) +{ + struct tcf_police *p = tcf_police_lookup(index); + + if (p != NULL) { + a->priv = p; + return 1; + } else { + return 0; + } +} +#endif + +static inline u32 tcf_police_new_index(void) +{ + do { + if (++idx_gen == 0) + idx_gen = 1; + } while (tcf_police_lookup(idx_gen)); + + return idx_gen; +} + +void tcf_police_destroy(struct tcf_police *p) +{ + unsigned h = tcf_police_hash(p->index); + struct tcf_police **p1p; + + for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { + if (*p1p == p) { + write_lock_bh(&police_lock); + *p1p = p->next; + write_unlock_bh(&police_lock); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&p->bstats, &p->rate_est); +#endif + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + if (p->P_tab) + qdisc_put_rtab(p->P_tab); + kfree(p); + return; + } + } + BUG_TRAP(0); +} + +#ifdef CONFIG_NET_CLS_ACT +static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, + struct tc_action *a, int ovr, int bind) +{ + unsigned h; + int ret = 0, err; + struct rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + struct tcf_police *p; + struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_POLICE_TBF-1] == NULL || + RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if (tb[TCA_POLICE_RESULT-1] != NULL && + RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + return -EINVAL; + if (tb[TCA_POLICE_RESULT-1] != NULL && + RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + return -EINVAL; + + if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + a->priv = p; + if (bind) { + p->bindcnt += 1; + p->refcnt += 1; + } + if (ovr) + goto override; + return ret; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return -ENOMEM; + memset(p, 0, sizeof(*p)); + + ret = ACT_P_CREATED; + p->refcnt = 1; + spin_lock_init(&p->lock); + p->stats_lock = &p->lock; + if (bind) + p->bindcnt = 1; +override: + if (parm->rate.rate) { + err = -ENOMEM; + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + if (R_tab == NULL) + goto failure; + if (parm->peakrate.rate) { + P_tab = qdisc_get_rtab(&parm->peakrate, + tb[TCA_POLICE_PEAKRATE-1]); + if (p->P_tab == NULL) { + qdisc_put_rtab(R_tab); + goto failure; + } + } + } + /* No failure allowed after this point */ + spin_lock_bh(&p->lock); + if (R_tab != NULL) { + qdisc_put_rtab(p->R_tab); + p->R_tab = R_tab; + } + if (P_tab != NULL) { + qdisc_put_rtab(p->P_tab); + p->P_tab = P_tab; + } + + if (tb[TCA_POLICE_RESULT-1]) + p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) { + p->mtu = ~0; + if (p->R_tab) + p->mtu = 255<<p->R_tab->rate.cell_log; + } + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + p->action = parm->action; + +#ifdef CONFIG_NET_ESTIMATOR + if (tb[TCA_POLICE_AVRATE-1]) + p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); + if (est) + gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); +#endif + + spin_unlock_bh(&p->lock); + if (ret != ACT_P_CREATED) + return ret; + + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? : tcf_police_new_index(); + h = tcf_police_hash(p->index); + write_lock_bh(&police_lock); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + write_unlock_bh(&police_lock); + + a->priv = p; + return ret; + +failure: + if (ret == ACT_P_CREATED) + kfree(p); + return err; +} + +static int tcf_act_police_cleanup(struct tc_action *a, int bind) +{ + struct tcf_police *p = PRIV(a); + + if (p != NULL) + return tcf_police_release(p, bind); + return 0; +} + +static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) +{ + psched_time_t now; + struct sk_buff *skb = *pskb; + struct tcf_police *p = PRIV(a); + long toks; + long ptoks = 0; + + spin_lock(&p->lock); + + p->bstats.bytes += skb->len; + p->bstats.packets++; + +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; + } +#endif + + if (skb->len <= p->mtu) { + if (p->R_tab == NULL) { + spin_unlock(&p->lock); + return p->result; + } + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + spin_unlock(&p->lock); + return p->result; + } + } + + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; +} + +static int +tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + struct tcf_police *p = PRIV(a); + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = p->burst; + opt.refcnt = p->refcnt - ref; + opt.bindcnt = p->bindcnt - bind; + if (p->R_tab) + opt.rate = p->R_tab->rate; + else + memset(&opt.rate, 0, sizeof(opt.rate)); + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + if (p->result) + RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate) + RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +MODULE_AUTHOR("Alexey Kuznetsov"); +MODULE_DESCRIPTION("Policing actions"); +MODULE_LICENSE("GPL"); + +static struct tc_action_ops act_police_ops = { + .kind = "police", + .type = TCA_ID_POLICE, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_act_police, + .dump = tcf_act_police_dump, + .cleanup = tcf_act_police_cleanup, + .lookup = tcf_hash_search, + .init = tcf_act_police_locate, + .walk = tcf_generic_walker +}; + +static int __init +police_init_module(void) +{ + return tcf_register_action(&act_police_ops); +} + +static void __exit +police_cleanup_module(void) +{ + tcf_unregister_action(&act_police_ops); +} + +module_init(police_init_module); +module_exit(police_cleanup_module); + +#endif + +struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) +{ + unsigned h; + struct tcf_police *p; + struct rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + + if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) + return NULL; + + if (tb[TCA_POLICE_TBF-1] == NULL || + RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) + return NULL; + + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + p->refcnt++; + return p; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + memset(p, 0, sizeof(*p)); + p->refcnt = 1; + spin_lock_init(&p->lock); + p->stats_lock = &p->lock; + if (parm->rate.rate) { + p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + if (p->R_tab == NULL) + goto failure; + if (parm->peakrate.rate) { + p->P_tab = qdisc_get_rtab(&parm->peakrate, + tb[TCA_POLICE_PEAKRATE-1]); + if (p->P_tab == NULL) + goto failure; + } + } + if (tb[TCA_POLICE_RESULT-1]) { + if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + goto failure; + p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); + } +#ifdef CONFIG_NET_ESTIMATOR + if (tb[TCA_POLICE_AVRATE-1]) { + if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32)) + goto failure; + p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); + } +#endif + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) { + p->mtu = ~0; + if (p->R_tab) + p->mtu = 255<<p->R_tab->rate.cell_log; + } + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? : tcf_police_new_index(); + p->action = parm->action; +#ifdef CONFIG_NET_ESTIMATOR + if (est) + gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); +#endif + h = tcf_police_hash(p->index); + write_lock_bh(&police_lock); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + write_unlock_bh(&police_lock); + return p; + +failure: + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + kfree(p); + return NULL; +} + +int tcf_police(struct sk_buff *skb, struct tcf_police *p) +{ + psched_time_t now; + long toks; + long ptoks = 0; + + spin_lock(&p->lock); + + p->bstats.bytes += skb->len; + p->bstats.packets++; + +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; + } +#endif + + if (skb->len <= p->mtu) { + if (p->R_tab == NULL) { + spin_unlock(&p->lock); + return p->result; + } + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + spin_unlock(&p->lock); + return p->result; + } + } + + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; +} + +int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = p->burst; + if (p->R_tab) + opt.rate = p->R_tab->rate; + else + memset(&opt.rate, 0, sizeof(opt.rate)); + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + if (p->result) + RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate) + RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p) +{ + struct gnet_dump d; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, p->stats_lock, &d) < 0) + goto errout; + + if (gnet_stats_copy_basic(&d, &p->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 || +#endif + gnet_stats_copy_queue(&d, &p->qstats) < 0) + goto errout; + + if (gnet_stats_finish_copy(&d) < 0) + goto errout; + + return 0; + +errout: + return -1; +} + + +EXPORT_SYMBOL(tcf_police); +EXPORT_SYMBOL(tcf_police_destroy); +EXPORT_SYMBOL(tcf_police_dump); +EXPORT_SYMBOL(tcf_police_dump_stats); +EXPORT_SYMBOL(tcf_police_hash); +EXPORT_SYMBOL(tcf_police_ht); +EXPORT_SYMBOL(tcf_police_locate); +EXPORT_SYMBOL(tcf_police_lookup); +EXPORT_SYMBOL(tcf_police_new_index); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c new file mode 100644 index 00000000000..4323a74eea3 --- /dev/null +++ b/net/sched/sch_api.c @@ -0,0 +1,1296 @@ +/* + * net/sched/sch_api.c Packet scheduler API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * + * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. + * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support + * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kmod.h> +#include <linux/list.h> +#include <linux/bitops.h> + +#include <net/sock.h> +#include <net/pkt_sched.h> + +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new); +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event); + +/* + + Short review. + ------------- + + This file consists of two interrelated parts: + + 1. queueing disciplines manager frontend. + 2. traffic classes manager frontend. + + Generally, queueing discipline ("qdisc") is a black box, + which is able to enqueue packets and to dequeue them (when + device is ready to send something) in order and at times + determined by algorithm hidden in it. + + qdisc's are divided to two categories: + - "queues", which have no internal structure visible from outside. + - "schedulers", which split all the packets to "traffic classes", + using "packet classifiers" (look at cls_api.c) + + In turn, classes may have child qdiscs (as rule, queues) + attached to them etc. etc. etc. + + The goal of the routines in this file is to translate + information supplied by user in the form of handles + to more intelligible for kernel form, to make some sanity + checks and part of work, which is common to all qdiscs + and to provide rtnetlink notifications. + + All real intelligent work is done inside qdisc modules. + + + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns 0, if packet was enqueued successfully. + If packet (this one or another one) was dropped, it returns + not zero error code. + NET_XMIT_DROP - this packet dropped + Expected action: do not backoff, but wait until queue will clear. + NET_XMIT_CN - probably this packet enqueued, but another one dropped. + Expected action: backoff or ignore + NET_XMIT_POLICED - dropped by police. + Expected action: backoff or error to real-time apps. + + Auxiliary routines: + + ---requeue + + requeues once dequeued packet. It is used for non-standard or + just buggy devices, which can defer output even if dev->tbusy=0. + + ---reset + + returns qdisc to initial state: purge all buffers, clear all + timers, counters (except for statistics) etc. + + ---init + + initializes newly created qdisc. + + ---destroy + + destroys resources allocated by init and during lifetime of qdisc. + + ---change + + changes qdisc parameters. + */ + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static DEFINE_RWLOCK(qdisc_mod_lock); + + +/************************************************ + * Queueing disciplines manipulation. * + ************************************************/ + + +/* The list of all installed queueing disciplines. */ + +static struct Qdisc_ops *qdisc_base; + +/* Register/uregister queueing discipline */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int rc = -EEXIST; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) + if (!strcmp(qops->id, q->id)) + goto out; + + if (qops->enqueue == NULL) + qops->enqueue = noop_qdisc_ops.enqueue; + if (qops->requeue == NULL) + qops->requeue = noop_qdisc_ops.requeue; + if (qops->dequeue == NULL) + qops->dequeue = noop_qdisc_ops.dequeue; + + qops->next = NULL; + *qp = qops; + rc = 0; +out: + write_unlock(&qdisc_mod_lock); + return rc; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int err = -ENOENT; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (q) { + *qp = q->next; + q->next = NULL; + err = 0; + } + write_unlock(&qdisc_mod_lock); + return err; +} + +/* We know handle. Find qdisc among all qdisc's attached to device + (root qdisc, all its children, children of children etc.) + */ + +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +{ + struct Qdisc *q; + + read_lock_bh(&qdisc_tree_lock); + list_for_each_entry(q, &dev->qdisc_list, list) { + if (q->handle == handle) { + read_unlock_bh(&qdisc_tree_lock); + return q; + } + } + read_unlock_bh(&qdisc_tree_lock); + return NULL; +} + +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +{ + unsigned long cl; + struct Qdisc *leaf; + struct Qdisc_class_ops *cops = p->ops->cl_ops; + + if (cops == NULL) + return NULL; + cl = cops->get(p, classid); + + if (cl == 0) + return NULL; + leaf = cops->leaf(p, cl); + cops->put(p, cl); + return leaf; +} + +/* Find queueing discipline by name */ + +static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +{ + struct Qdisc_ops *q = NULL; + + if (kind) { + read_lock(&qdisc_mod_lock); + for (q = qdisc_base; q; q = q->next) { + if (rtattr_strcmp(kind, q->id) == 0) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + read_unlock(&qdisc_mod_lock); + } + return q; +} + +static struct qdisc_rate_table *qdisc_rtab_list; + +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +{ + struct qdisc_rate_table *rtab; + + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { + if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + rtab->refcnt++; + return rtab; + } + } + + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) + return NULL; + + rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + if (rtab) { + rtab->rate = *r; + rtab->refcnt = 1; + memcpy(rtab->data, RTA_DATA(tab), 1024); + rtab->next = qdisc_rtab_list; + qdisc_rtab_list = rtab; + } + return rtab; +} + +void qdisc_put_rtab(struct qdisc_rate_table *tab) +{ + struct qdisc_rate_table *rtab, **rtabp; + + if (!tab || --tab->refcnt) + return; + + for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + if (rtab == tab) { + *rtabp = rtab->next; + kfree(rtab); + return; + } + } +} + + +/* Allocate an unique handle from space managed by kernel */ + +static u32 qdisc_alloc_handle(struct net_device *dev) +{ + int i = 0x10000; + static u32 autohandle = TC_H_MAKE(0x80000000U, 0); + + do { + autohandle += TC_H_MAKE(0x10000U, 0); + if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) + autohandle = TC_H_MAKE(0x80000000U, 0); + } while (qdisc_lookup(dev, autohandle) && --i > 0); + + return i>0 ? autohandle : 0; +} + +/* Attach toplevel qdisc to device dev */ + +static struct Qdisc * +dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + qdisc_lock_tree(dev); + if (qdisc && qdisc->flags&TCQ_F_INGRESS) { + oqdisc = dev->qdisc_ingress; + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { + /* delete */ + qdisc_reset(oqdisc); + dev->qdisc_ingress = NULL; + } else { /* new */ + dev->qdisc_ingress = qdisc; + } + + } else { + + oqdisc = dev->qdisc_sleeping; + + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) + qdisc_reset(oqdisc); + + /* ... and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; + dev->qdisc_sleeping = qdisc; + dev->qdisc = &noop_qdisc; + } + + qdisc_unlock_tree(dev); + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return oqdisc; +} + + +/* Graft qdisc "new" to class "classid" of qdisc "parent" or + to device "dev". + + Old qdisc is not destroyed but returned in *old. + */ + +static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, + u32 classid, + struct Qdisc *new, struct Qdisc **old) +{ + int err = 0; + struct Qdisc *q = *old; + + + if (parent == NULL) { + if (q && q->flags&TCQ_F_INGRESS) { + *old = dev_graft_qdisc(dev, q); + } else { + *old = dev_graft_qdisc(dev, new); + } + } else { + struct Qdisc_class_ops *cops = parent->ops->cl_ops; + + err = -EINVAL; + + if (cops) { + unsigned long cl = cops->get(parent, classid); + if (cl) { + err = cops->graft(parent, cl, new, old); + if (new) + new->parent = classid; + cops->put(parent, cl); + } + } + } + return err; +} + +/* + Allocate and initialize new qdisc. + + Parameters are passed via opt. + */ + +static struct Qdisc * +qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) +{ + int err; + struct rtattr *kind = tca[TCA_KIND-1]; + void *p = NULL; + struct Qdisc *sch; + struct Qdisc_ops *ops; + int size; + + ops = qdisc_lookup_ops(kind); +#ifdef CONFIG_KMOD + if (ops == NULL && kind != NULL) { + char name[IFNAMSIZ]; + if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * tell the caller to replay the request. We + * indicate this using -EAGAIN. + * We replay the request because the device may + * go away in the mean time. + */ + rtnl_unlock(); + request_module("sch_%s", name); + rtnl_lock(); + ops = qdisc_lookup_ops(kind); + if (ops != NULL) { + /* We will try again qdisc_lookup_ops, + * so don't keep a reference. + */ + module_put(ops->owner); + err = -EAGAIN; + goto err_out; + } + } + } +#endif + + err = -EINVAL; + if (ops == NULL) + goto err_out; + + /* ensure that the Qdisc and the private data are 32-byte aligned */ + size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); + size += ops->priv_size + QDISC_ALIGN_CONST; + + p = kmalloc(size, GFP_KERNEL); + err = -ENOBUFS; + if (!p) + goto err_out2; + memset(p, 0, size); + sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) + & ~QDISC_ALIGN_CONST); + sch->padded = (char *)sch - (char *)p; + + INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->q); + + if (handle == TC_H_INGRESS) + sch->flags |= TCQ_F_INGRESS; + + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + dev_hold(dev); + atomic_set(&sch->refcnt, 1); + sch->stats_lock = &dev->queue_lock; + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out3; + } + + if (handle == TC_H_INGRESS) + sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); + else + sch->handle = handle; + + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { + qdisc_lock_tree(dev); + list_add_tail(&sch->list, &dev->qdisc_list); + qdisc_unlock_tree(dev); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, tca[TCA_RATE-1]); +#endif + return sch; + } +err_out3: + dev_put(dev); +err_out2: + module_put(ops->owner); +err_out: + *errp = err; + if (p) + kfree(p); + return NULL; +} + +static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) +{ + if (tca[TCA_OPTIONS-1]) { + int err; + + if (sch->ops->change == NULL) + return -EINVAL; + err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); + if (err) + return err; + } +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, tca[TCA_RATE-1]); +#endif + return 0; +} + +struct check_loop_arg +{ + struct qdisc_walker w; + struct Qdisc *p; + int depth; +}; + +static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); + +static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) +{ + struct check_loop_arg arg; + + if (q->ops->cl_ops == NULL) + return 0; + + arg.w.stop = arg.w.skip = arg.w.count = 0; + arg.w.fn = check_loop_fn; + arg.depth = depth; + arg.p = p; + q->ops->cl_ops->walk(q, &arg.w); + return arg.w.stop ? -ELOOP : 0; +} + +static int +check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) +{ + struct Qdisc *leaf; + struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct check_loop_arg *arg = (struct check_loop_arg *)w; + + leaf = cops->leaf(q, cl); + if (leaf) { + if (leaf == arg->p || arg->depth > 7) + return -ELOOP; + return check_loop(leaf, arg->p, arg->depth + 1); + } + return 0; +} + +/* + * Delete/get qdisc. + */ + +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + if (clid) { + if (clid != TC_H_ROOT) { + if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { + if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else { /* ingress */ + q = dev->qdisc_ingress; + } + } else { + q = dev->qdisc_sleeping; + } + if (!q) + return -ENOENT; + + if (tcm->tcm_handle && q->handle != tcm->tcm_handle) + return -EINVAL; + } else { + if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + return -ENOENT; + } + + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + + if (n->nlmsg_type == RTM_DELQDISC) { + if (!clid) + return -EINVAL; + if (q->handle == 0) + return -ENOENT; + if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) + return err; + if (q) { + qdisc_notify(skb, n, clid, q, NULL); + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(q); + spin_unlock_bh(&dev->queue_lock); + } + } else { + qdisc_notify(skb, n, clid, NULL, q); + } + return 0; +} + +/* + Create/change qdisc. + */ + +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm; + struct rtattr **tca; + struct net_device *dev; + u32 clid; + struct Qdisc *q, *p; + int err; + +replay: + /* Reinit, just in case something touches this. */ + tcm = NLMSG_DATA(n); + tca = arg; + clid = tcm->tcm_parent; + q = p = NULL; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + if (clid) { + if (clid != TC_H_ROOT) { + if (clid != TC_H_INGRESS) { + if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else { /*ingress */ + q = dev->qdisc_ingress; + } + } else { + q = dev->qdisc_sleeping; + } + + /* It may be default qdisc, ignore it */ + if (q && q->handle == 0) + q = NULL; + + if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { + if (tcm->tcm_handle) { + if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) + return -EEXIST; + if (TC_H_MIN(tcm->tcm_handle)) + return -EINVAL; + if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + goto create_n_graft; + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + if (q == p || + (p && check_loop(q, p, 0))) + return -ELOOP; + atomic_inc(&q->refcnt); + goto graft; + } else { + if (q == NULL) + goto create_n_graft; + + /* This magic test requires explanation. + * + * We know, that some child q is already + * attached to this parent and have choice: + * either to change it or to create/graft new one. + * + * 1. We are allowed to create/graft only + * if CREATE and REPLACE flags are set. + * + * 2. If EXCL is set, requestor wanted to say, + * that qdisc tcm_handle is not expected + * to exist, so that we choose create/graft too. + * + * 3. The last case is when no flags are set. + * Alas, it is sort of hole in API, we + * cannot decide what to do unambiguously. + * For now we select create/graft, if + * user gave KIND, which does not match existing. + */ + if ((n->nlmsg_flags&NLM_F_CREATE) && + (n->nlmsg_flags&NLM_F_REPLACE) && + ((n->nlmsg_flags&NLM_F_EXCL) || + (tca[TCA_KIND-1] && + rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) + goto create_n_graft; + } + } + } else { + if (!tcm->tcm_handle) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + } + + /* Change qdisc parameters */ + if (q == NULL) + return -ENOENT; + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + err = qdisc_change(q, tca); + if (err == 0) + qdisc_notify(skb, n, clid, NULL, q); + return err; + +create_n_graft: + if (!(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (clid == TC_H_INGRESS) + q = qdisc_create(dev, tcm->tcm_parent, tca, &err); + else + q = qdisc_create(dev, tcm->tcm_handle, tca, &err); + if (q == NULL) { + if (err == -EAGAIN) + goto replay; + return err; + } + +graft: + if (1) { + struct Qdisc *old_q = NULL; + err = qdisc_graft(dev, p, clid, q, &old_q); + if (err) { + if (q) { + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(q); + spin_unlock_bh(&dev->queue_lock); + } + return err; + } + qdisc_notify(skb, n, clid, old_q, q); + if (old_q) { + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(old_q); + spin_unlock_bh(&dev->queue_lock); + } + } + return 0; +} + +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct gnet_dump d; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm_parent = clid; + tcm->tcm_handle = q->handle; + tcm->tcm_info = atomic_read(&q->refcnt); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->dump && q->ops->dump(q, skb) < 0) + goto rtattr_failure; + q->qstats.qlen = q->q.qlen; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, q->stats_lock, &d) < 0) + goto rtattr_failure; + + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) + goto rtattr_failure; + + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || +#endif + gnet_stats_copy_queue(&d, &q->qstats) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto rtattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + u32 clid, struct Qdisc *old, struct Qdisc *new) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (old && old->handle) { + if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + goto err_out; + } + if (new) { + if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + +static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, q_idx; + int s_idx, s_q_idx; + struct net_device *dev; + struct Qdisc *q; + + s_idx = cb->args[0]; + s_q_idx = q_idx = cb->args[1]; + read_lock(&dev_base_lock); + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_q_idx = 0; + read_lock_bh(&qdisc_tree_lock); + q_idx = 0; + list_for_each_entry(q, &dev->qdisc_list, list) { + if (q_idx < s_q_idx) { + q_idx++; + continue; + } + if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { + read_unlock_bh(&qdisc_tree_lock); + goto done; + } + q_idx++; + } + read_unlock_bh(&qdisc_tree_lock); + } + +done: + read_unlock(&dev_base_lock); + + cb->args[0] = idx; + cb->args[1] = q_idx; + + return skb->len; +} + + + +/************************************************ + * Traffic classes manipulation. * + ************************************************/ + + + +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + struct Qdisc *q = NULL; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long new_cl; + u32 pid = tcm->tcm_parent; + u32 clid = tcm->tcm_handle; + u32 qid = TC_H_MAJ(clid); + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* + parent == TC_H_UNSPEC - unspecified parent. + parent == TC_H_ROOT - class is root, which has no parent. + parent == X:0 - parent is root class. + parent == X:Y - parent is a node in hierarchy. + parent == 0:Y - parent is X:Y, where X:0 is qdisc. + + handle == 0:0 - generate handle from kernel pool. + handle == 0:Y - class is X:Y, where X:0 is qdisc. + handle == X:Y - clear. + handle == X:0 - root class. + */ + + /* Step 1. Determine qdisc handle X:0 */ + + if (pid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(pid); + + if (qid && qid1) { + /* If both majors are known, they must be identical. */ + if (qid != qid1) + return -EINVAL; + } else if (qid1) { + qid = qid1; + } else if (qid == 0) + qid = dev->qdisc_sleeping->handle; + + /* Now qid is genuine qdisc handle consistent + both with parent and child. + + TC_H_MAJ(pid) still may be unspecified, complete it now. + */ + if (pid) + pid = TC_H_MAKE(qid, pid); + } else { + if (qid == 0) + qid = dev->qdisc_sleeping->handle; + } + + /* OK. Locate qdisc */ + if ((q = qdisc_lookup(dev, qid)) == NULL) + return -ENOENT; + + /* An check that it supports classes */ + cops = q->ops->cl_ops; + if (cops == NULL) + return -EINVAL; + + /* Now try to get class */ + if (clid == 0) { + if (pid == TC_H_ROOT) + clid = qid; + } else + clid = TC_H_MAKE(qid, clid); + + if (clid) + cl = cops->get(q, clid); + + if (cl == 0) { + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTCLASS: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + break; + case RTM_DELTCLASS: + err = cops->delete(q, cl); + if (err == 0) + tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + goto out; + case RTM_GETTCLASS: + err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + goto out; + default: + err = -EINVAL; + goto out; + } + } + + new_cl = cl; + err = cops->change(q, clid, pid, tca, &new_cl); + if (err == 0) + tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + +out: + if (cl) + cops->put(q, cl); + + return err; +} + + +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, + unsigned long cl, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct gnet_dump d; + struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm_parent = q->handle; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) + goto rtattr_failure; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, q->stats_lock, &d) < 0) + goto rtattr_failure; + + if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto rtattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct qdisc_dump_args +{ + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +{ + struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; + + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); +} + +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct qdisc_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return 0; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return 0; + + s_t = cb->args[0]; + t = 0; + + read_lock_bh(&qdisc_tree_lock); + list_for_each_entry(q, &dev->qdisc_list, list) { + if (t < s_t || !q->ops->cl_ops || + (tcm->tcm_parent && + TC_H_MAJ(tcm->tcm_parent) != q->handle)) { + t++; + continue; + } + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + break; + t++; + } + read_unlock_bh(&qdisc_tree_lock); + + cb->args[0] = t; + + dev_put(dev); + return skb->len; +} + +/* Main classifier routine: scans classifier chain attached + to this qdisc, (optionally) tests for protocol and asks + specific classifiers. + */ +int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int err = 0; + u32 protocol = skb->protocol; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *otp = tp; +reclassify: +#endif + protocol = skb->protocol; + + for ( ; tp; tp = tp->next) { + if ((tp->protocol == protocol || + tp->protocol == __constant_htons(ETH_P_ALL)) && + (err = tp->classify(skb, tp, res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + if ( TC_ACT_RECLASSIFY == err) { + __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); + tp = otp; + + if (MAX_REC_LOOP < verd++) { + printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n", + tp->prio&0xffff, ntohs(tp->protocol)); + return TC_ACT_SHOT; + } + skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd); + goto reclassify; + } else { + if (skb->tc_verd) + skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); + return err; + } +#else + + return err; +#endif + } + + } + return -1; +} + +static int psched_us_per_tick = 1; +static int psched_tick_per_us = 1; + +#ifdef CONFIG_PROC_FS +static int psched_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%08x %08x %08x %08x\n", + psched_tick_per_us, psched_us_per_tick, + 1000000, HZ); + + return 0; +} + +static int psched_open(struct inode *inode, struct file *file) +{ + return single_open(file, psched_show, PDE(inode)->data); +} + +static struct file_operations psched_fops = { + .owner = THIS_MODULE, + .open = psched_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_NET_SCH_CLK_CPU +psched_tdiff_t psched_clock_per_hz; +int psched_clock_scale; +EXPORT_SYMBOL(psched_clock_per_hz); +EXPORT_SYMBOL(psched_clock_scale); + +psched_time_t psched_time_base; +cycles_t psched_time_mark; +EXPORT_SYMBOL(psched_time_mark); +EXPORT_SYMBOL(psched_time_base); + +/* + * Periodically adjust psched_time_base to avoid overflow + * with 32-bit get_cycles(). Safe up to 4GHz CPU. + */ +static void psched_tick(unsigned long); +static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0); + +static void psched_tick(unsigned long dummy) +{ + if (sizeof(cycles_t) == sizeof(u32)) { + psched_time_t dummy_stamp; + PSCHED_GET_TIME(dummy_stamp); + psched_timer.expires = jiffies + 1*HZ; + add_timer(&psched_timer); + } +} + +int __init psched_calibrate_clock(void) +{ + psched_time_t stamp, stamp1; + struct timeval tv, tv1; + psched_tdiff_t delay; + long rdelay; + unsigned long stop; + + psched_tick(0); + stop = jiffies + HZ/10; + PSCHED_GET_TIME(stamp); + do_gettimeofday(&tv); + while (time_before(jiffies, stop)) { + barrier(); + cpu_relax(); + } + PSCHED_GET_TIME(stamp1); + do_gettimeofday(&tv1); + + delay = PSCHED_TDIFF(stamp1, stamp); + rdelay = tv1.tv_usec - tv.tv_usec; + rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; + if (rdelay > delay) + return -1; + delay /= rdelay; + psched_tick_per_us = delay; + while ((delay>>=1) != 0) + psched_clock_scale++; + psched_us_per_tick = 1<<psched_clock_scale; + psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; + return 0; +} +#endif + +static int __init pktsched_init(void) +{ + struct rtnetlink_link *link_p; + +#ifdef CONFIG_NET_SCH_CLK_CPU + if (psched_calibrate_clock() < 0) + return -1; +#elif defined(CONFIG_NET_SCH_CLK_JIFFIES) + psched_tick_per_us = HZ<<PSCHED_JSCALE; + psched_us_per_tick = 1000000; +#endif + + link_p = rtnetlink_links[PF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; + link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; + link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; + link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; + link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; + } + + register_qdisc(&pfifo_qdisc_ops); + register_qdisc(&bfifo_qdisc_ops); + proc_net_fops_create("psched", 0, &psched_fops); + + return 0; +} + +subsys_initcall(pktsched_init); + +EXPORT_SYMBOL(qdisc_get_rtab); +EXPORT_SYMBOL(qdisc_put_rtab); +EXPORT_SYMBOL(register_qdisc); +EXPORT_SYMBOL(unregister_qdisc); +EXPORT_SYMBOL(tc_classify); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c new file mode 100644 index 00000000000..93ebce40aca --- /dev/null +++ b/net/sched/sch_atm.c @@ -0,0 +1,735 @@ +/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/interrupt.h> +#include <linux/atmdev.h> +#include <linux/atmclip.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/file.h> /* for fput */ +#include <net/pkt_sched.h> +#include <net/sock.h> + + +extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ + +#if 0 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +/* + * The ATM queuing discipline provides a framework for invoking classifiers + * (aka "filters"), which in turn select classes of this queuing discipline. + * Each class maps the flow(s) it is handling to a given VC. Multiple classes + * may share the same VC. + * + * When creating a class, VCs are specified by passing the number of the open + * socket descriptor by which the calling process references the VC. The kernel + * keeps the VC open at least until all classes using it are removed. + * + * In this file, most functions are named atm_tc_* to avoid confusion with all + * the atm_* in net/atm. This naming convention differs from what's used in the + * rest of net/sched. + * + * Known bugs: + * - sometimes messes up the IP stack + * - any manipulations besides the few operations described in the README, are + * untested and likely to crash the system + * - should lock the flow while there is data in the queue (?) + */ + + +#define PRIV(sch) qdisc_priv(sch) +#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) + + +struct atm_flow_data { + struct Qdisc *q; /* FIFO, TBF, etc. */ + struct tcf_proto *filter_list; + struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ + void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ + struct atm_qdisc_data *parent; /* parent qdisc */ + struct socket *sock; /* for closing */ + u32 classid; /* x:y type ID */ + int ref; /* reference count */ + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + spinlock_t *stats_lock; + struct atm_flow_data *next; + struct atm_flow_data *excess; /* flow for excess traffic; + NULL to set CLP instead */ + int hdr_len; + unsigned char hdr[0]; /* header data; MUST BE LAST */ +}; + +struct atm_qdisc_data { + struct atm_flow_data link; /* unclassified skbs go here */ + struct atm_flow_data *flows; /* NB: "link" is also on this + list */ + struct tasklet_struct task; /* requeue tasklet */ +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) +{ + struct atm_flow_data *walk; + + DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); + for (walk = qdisc->flows; walk; walk = walk->next) + if (walk == flow) return 1; + DPRINTK("find_flow: not found\n"); + return 0; +} + + +static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, + u32 classid) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + for (flow = p->flows; flow; flow = flow->next) + if (flow->classid == classid) break; + return flow; +} + + +static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, + p,flow,new,old); + if (!find_flow(p,flow)) return -EINVAL; + if (!new) new = &noop_qdisc; + *old = xchg(&flow->q,new); + if (*old) qdisc_reset(*old); + return 0; +} + + +static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) +{ + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + + DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); + return flow ? flow->q : NULL; +} + + +static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) +{ + struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); + flow = lookup_flow(sch,classid); + if (flow) flow->ref++; + DPRINTK("atm_tc_get: flow %p\n",flow); + return (unsigned long) flow; +} + + +static unsigned long atm_tc_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return atm_tc_get(sch,classid); +} + + +static void destroy_filters(struct atm_flow_data *flow) +{ + struct tcf_proto *filter; + + while ((filter = flow->filter_list)) { + DPRINTK("destroy_filters: destroying filter %p\n",filter); + flow->filter_list = filter->next; + tcf_destroy(filter); + } +} + + +/* + * atm_tc_put handles all destructions, including the ones that are explicitly + * requested (atm_tc_destroy, etc.). The assumption here is that we never drop + * anything that still seems to be in use. + */ + +static void atm_tc_put(struct Qdisc *sch, unsigned long cl) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_flow_data **prev; + + DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + if (--flow->ref) return; + DPRINTK("atm_tc_put: destroying\n"); + for (prev = &p->flows; *prev; prev = &(*prev)->next) + if (*prev == flow) break; + if (!*prev) { + printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); + return; + } + *prev = flow->next; + DPRINTK("atm_tc_put: qdisc %p\n",flow->q); + qdisc_destroy(flow->q); + destroy_filters(flow); + if (flow->sock) { + DPRINTK("atm_tc_put: f_count %d\n", + file_count(flow->sock->file)); + flow->vcc->pop = flow->old_pop; + sockfd_put(flow->sock); + } + if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); + if (flow != &p->link) kfree(flow); + /* + * If flow == &p->link, the qdisc no longer works at this point and + * needs to be removed. (By the caller of atm_tc_put.) + */ +} + + +static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; + + D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p); + VCC2FLOW(vcc)->old_pop(vcc,skb); + tasklet_schedule(&p->task); +} + +static const u8 llc_oui_ip[] = { + 0xaa, /* DSAP: non-ISO */ + 0xaa, /* SSAP: non-ISO */ + 0x03, /* Ctrl: Unnumbered Information Command PDU */ + 0x00, /* OUI: EtherType */ + 0x00, 0x00, + 0x08, 0x00 }; /* Ethertype IP (0800) */ + +static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) *arg; + struct atm_flow_data *excess = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_ATM_MAX]; + struct socket *sock; + int fd,error,hdr_len; + void *hdr; + + DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," + "flow %p,opt %p)\n",sch,p,classid,parent,flow,opt); + /* + * The concept of parents doesn't apply for this qdisc. + */ + if (parent && parent != TC_H_ROOT && parent != sch->handle) + return -EINVAL; + /* + * ATM classes cannot be changed. In order to change properties of the + * ATM connection, that socket needs to be modified directly (via the + * native ATM API. In order to send a flow to a different VC, the old + * class needs to be removed and a new one added. (This may be changed + * later.) + */ + if (flow) return -EBUSY; + if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt)) + return -EINVAL; + if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) + return -EINVAL; + fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); + DPRINTK("atm_tc_change: fd %d\n",fd); + if (tb[TCA_ATM_HDR-1]) { + hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); + hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); + } + else { + hdr_len = RFC1483LLC_LEN; + hdr = NULL; /* default LLC/SNAP for IP */ + } + if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; + else { + if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) + return -EINVAL; + excess = (struct atm_flow_data *) atm_tc_get(sch, + *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); + if (!excess) return -ENOENT; + } + DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->rta_type,RTA_PAYLOAD(opt),hdr_len); + if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ + DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); + if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { + error = -EPROTOTYPE; + goto err_out; + } + /* @@@ should check if the socket is really operational or we'll crash + on vcc->send */ + if (classid) { + if (TC_H_MAJ(classid ^ sch->handle)) { + DPRINTK("atm_tc_change: classid mismatch\n"); + error = -EINVAL; + goto err_out; + } + if (find_flow(p,flow)) { + error = -EEXIST; + goto err_out; + } + } + else { + int i; + unsigned long cl; + + for (i = 1; i < 0x8000; i++) { + classid = TC_H_MAKE(sch->handle,0x8000 | i); + if (!(cl = atm_tc_get(sch,classid))) break; + atm_tc_put(sch,cl); + } + } + DPRINTK("atm_tc_change: new id %x\n",classid); + flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL); + DPRINTK("atm_tc_change: flow %p\n",flow); + if (!flow) { + error = -ENOBUFS; + goto err_out; + } + memset(flow,0,sizeof(*flow)); + flow->filter_list = NULL; + if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + flow->q = &noop_qdisc; + DPRINTK("atm_tc_change: qdisc %p\n",flow->q); + flow->sock = sock; + flow->vcc = ATM_SD(sock); /* speedup */ + flow->vcc->user_back = flow; + DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); + flow->old_pop = flow->vcc->pop; + flow->parent = p; + flow->vcc->pop = sch_atm_pop; + flow->classid = classid; + flow->ref = 1; + flow->excess = excess; + flow->next = p->link.next; + p->link.next = flow; + flow->hdr_len = hdr_len; + if (hdr) + memcpy(flow->hdr,hdr,hdr_len); + else + memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip)); + *arg = (unsigned long) flow; + return 0; +err_out: + if (excess) atm_tc_put(sch,(unsigned long) excess); + sockfd_put(sock); + return error; +} + + +static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + if (!find_flow(PRIV(sch),flow)) return -EINVAL; + if (flow->filter_list || flow == &p->link) return -EBUSY; + /* + * Reference count must be 2: one for "keepalive" (set at class + * creation), and one for the reference held when calling delete. + */ + if (flow->ref < 2) { + printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); + return -EINVAL; + } + if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ + atm_tc_put(sch,arg); + return 0; +} + + +static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); + if (walker->stop) return; + for (flow = p->flows; flow; flow = flow->next) { + if (walker->count >= walker->skip) + if (walker->fn(sch,(unsigned long) flow,walker) < 0) { + walker->stop = 1; + break; + } + walker->count++; + } +} + + +static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + + DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + return flow ? &flow->filter_list : &p->link.filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = NULL ; /* @@@ */ + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + result = TC_POLICE_OK; /* be nice to gcc */ + if (TC_H_MAJ(skb->priority) != sch->handle || + !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) + for (flow = p->flows; flow; flow = flow->next) + if (flow->filter_list) { + result = tc_classify(skb,flow->filter_list, + &res); + if (result < 0) continue; + flow = (struct atm_flow_data *) res.class; + if (!flow) flow = lookup_flow(sch,res.classid); + break; + } + if (!flow) flow = &p->link; + else { + if (flow->vcc) + ATM_SKB(skb)->atm_options = flow->vcc->atm_options; + /*@@@ looks good ... but it's not supposed to work :-)*/ +#ifdef CONFIG_NET_CLS_POLICE + switch (result) { + case TC_POLICE_SHOT: + kfree_skb(skb); + break; + case TC_POLICE_RECLASSIFY: + if (flow->excess) flow = flow->excess; + else { + ATM_SKB(skb)->atm_options |= + ATM_ATMOPT_CLP; + break; + } + /* fall through */ + case TC_POLICE_OK: + /* fall through */ + default: + break; + } +#endif + } + if ( +#ifdef CONFIG_NET_CLS_POLICE + result == TC_POLICE_SHOT || +#endif + (ret = flow->q->enqueue(skb,flow->q)) != 0) { + sch->qstats.drops++; + if (flow) flow->qstats.drops++; + return ret; + } + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + flow->bstats.bytes += skb->len; + flow->bstats.packets++; + /* + * Okay, this may seem weird. We pretend we've dropped the packet if + * it goes via ATM. The reason for this is that the outer qdisc + * expects to be able to q->dequeue the packet later on if we return + * success at this place. Also, sch->q.qdisc needs to reflect whether + * there is a packet egligible for dequeuing or not. Note that the + * statistics of the outer qdisc are necessarily wrong because of all + * this. There's currently no correct solution for this. + */ + if (flow == &p->link) { + sch->q.qlen++; + return 0; + } + tasklet_schedule(&p->task); + return NET_XMIT_BYPASS; +} + + +/* + * Dequeue packets and send them over ATM. Note that we quite deliberately + * avoid checking net_device's flow control here, simply because sch_atm + * uses its own channels, which have nothing to do with any CLIP/LANE/or + * non-ATM interfaces. + */ + + +static void sch_atm_dequeue(unsigned long data) +{ + struct Qdisc *sch = (struct Qdisc *) data; + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + struct sk_buff *skb; + + D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->link.next; flow; flow = flow->next) + /* + * If traffic is properly shaped, this won't generate nasty + * little bursts. Otherwise, it may ... (but that's okay) + */ + while ((skb = flow->q->dequeue(flow->q))) { + if (!atm_may_send(flow->vcc,skb->truesize)) { + (void) flow->q->ops->requeue(skb,flow->q); + break; + } + D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow); + /* remove any LL header somebody else has attached */ + skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); + if (skb_headroom(skb) < flow->hdr_len) { + struct sk_buff *new; + + new = skb_realloc_headroom(skb,flow->hdr_len); + dev_kfree_skb(skb); + if (!new) continue; + skb = new; + } + D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", + skb->nh.iph,skb->data); + ATM_SKB(skb)->vcc = flow->vcc; + memcpy(skb_push(skb,flow->hdr_len),flow->hdr, + flow->hdr_len); + atomic_add(skb->truesize, + &sk_atm(flow->vcc)->sk_wmem_alloc); + /* atm.atm_options are already set by atm_tc_enqueue */ + (void) flow->vcc->send(flow->vcc,skb); + } +} + + +static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct sk_buff *skb; + + D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); + tasklet_schedule(&p->task); + skb = p->link.q->dequeue(p->link.q); + if (skb) sch->q.qlen--; + return skb; +} + + +static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + int ret; + + D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + ret = p->link.q->ops->requeue(skb,p->link.q); + if (!ret) { + sch->q.qlen++; + sch->qstats.requeues++; + } else { + sch->qstats.drops++; + p->link.qstats.drops++; + } + return ret; +} + + +static unsigned int atm_tc_drop(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + unsigned int len; + + DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->flows; flow; flow = flow->next) + if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) + return len; + return 0; +} + + +static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct atm_qdisc_data *p = PRIV(sch); + + DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + p->flows = &p->link; + if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + p->link.q = &noop_qdisc; + DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); + p->link.filter_list = NULL; + p->link.vcc = NULL; + p->link.sock = NULL; + p->link.classid = sch->handle; + p->link.ref = 1; + p->link.next = NULL; + tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); + return 0; +} + + +static void atm_tc_reset(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); + sch->q.qlen = 0; +} + + +static void atm_tc_destroy(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); + /* races ? */ + while ((flow = p->flows)) { + destroy_filters(flow); + if (flow->ref > 1) + printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, + flow->ref); + atm_tc_put(sch,(unsigned long) flow); + if (p->flows == flow) { + printk(KERN_ERR "atm_destroy: putting flow %p didn't " + "kill it\n",flow); + p->flows = flow->next; /* brute force */ + break; + } + } + tasklet_kill(&p->task); +} + + +static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + sch,p,flow,skb,tcm); + if (!find_flow(p,flow)) return -EINVAL; + tcm->tcm_handle = flow->classid; + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); + if (flow->vcc) { + struct sockaddr_atmpvc pvc; + int state; + + pvc.sap_family = AF_ATMPVC; + pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; + pvc.sap_addr.vpi = flow->vcc->vpi; + pvc.sap_addr.vci = flow->vcc->vci; + RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); + state = ATM_VF2VS(flow->vcc->flags); + RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); + } + if (flow->excess) + RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); + else { + static u32 zero; + + RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); + } + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} +static int +atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + flow->qstats.qlen = flow->q->q.qlen; + + if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || + gnet_stats_copy_queue(d, &flow->qstats) < 0) + return -1; + + return 0; +} + +static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return 0; +} + +static struct Qdisc_class_ops atm_class_ops = { + .graft = atm_tc_graft, + .leaf = atm_tc_leaf, + .get = atm_tc_get, + .put = atm_tc_put, + .change = atm_tc_change, + .delete = atm_tc_delete, + .walk = atm_tc_walk, + .tcf_chain = atm_tc_find_tcf, + .bind_tcf = atm_tc_bind_filter, + .unbind_tcf = atm_tc_put, + .dump = atm_tc_dump_class, + .dump_stats = atm_tc_dump_class_stats, +}; + +static struct Qdisc_ops atm_qdisc_ops = { + .next = NULL, + .cl_ops = &atm_class_ops, + .id = "atm", + .priv_size = sizeof(struct atm_qdisc_data), + .enqueue = atm_tc_enqueue, + .dequeue = atm_tc_dequeue, + .requeue = atm_tc_requeue, + .drop = atm_tc_drop, + .init = atm_tc_init, + .reset = atm_tc_reset, + .destroy = atm_tc_destroy, + .change = NULL, + .dump = atm_tc_dump, + .owner = THIS_MODULE, +}; + + +static int __init atm_init(void) +{ + return register_qdisc(&atm_qdisc_ops); +} + +static void __exit atm_exit(void) +{ + unregister_qdisc(&atm_qdisc_ops); +} + +module_init(atm_init) +module_exit(atm_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 00000000000..d43e3b8cbf6 --- /dev/null +++ b/net/sched/sch_cbq.c @@ -0,0 +1,2124 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Class-Based Queueing (CBQ) algorithm. + ======================================= + + Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource + Management Models for Packet Networks", + IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + + [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + Parameters", 1996 + + [4] Sally Floyd and Michael Speer, "Experimental Results + for Class-Based Queueing", 1998, not published. + + ----------------------------------------------------------------------- + + Algorithm skeleton was taken from NS simulator cbq.cc. + If someone wants to check this code against the LBL version, + he should take into account that ONLY the skeleton was borrowed, + the implementation is different. Particularly: + + --- The WRR algorithm is different. Our version looks more + reasonable (I hope) and works when quanta are allowed to be + less than MTU, which is always the case when real time classes + have small rates. Note, that the statement of [3] is + incomplete, delay may actually be estimated even if class + per-round allotment is less than MTU. Namely, if per-round + allotment is W*r_i, and r_1+...+r_k = r < 1 + + delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B + + In the worst case we have IntServ estimate with D = W*r+k*MTU + and C = MTU*r. The proof (if correct at all) is trivial. + + + --- It seems that cbq-2.0 is not very accurate. At least, I cannot + interpret some places, which look like wrong translations + from NS. Anyone is advised to find these differences + and explain to me, why I am wrong 8). + + --- Linux has no EOI event, so that we cannot estimate true class + idle time. Workaround is to consider the next dequeue event + as sign that previous packet is finished. This is wrong because of + internal device queueing, but on a permanently loaded link it is true. + Moreover, combined with clock integrator, this scheme looks + very close to an ideal solution. */ + +struct cbq_sched_data; + + +struct cbq_class +{ + struct cbq_class *next; /* hash table link */ + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Parameters */ + u32 classid; + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_POLICE + unsigned char police; +#endif + + u32 defmap; + + /* Link-sharing scheduler parameters */ + long maxidle; /* Class parameters: see below. */ + long offtime; + long minidle; + u32 avpkt; + struct qdisc_rate_table *R_tab; + + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + long penalty; + + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ + + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ + + struct Qdisc *q; /* Elementary queueing discipline */ + + +/* Variables */ + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. + */ + + psched_time_t last; /* Last end of service */ + psched_time_t undertime; + long avgidle; + long deficit; /* Saved deficit for WRR */ + unsigned long penalized; + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + spinlock_t *stats_lock; + struct tc_cbq_xstats xstats; + + struct tcf_proto *filter_list; + + int refcnt; + int filters; + + struct cbq_class *defaults[TC_PRIO_MAX+1]; +}; + +struct cbq_sched_data +{ + struct cbq_class *classes[16]; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO+1]; + unsigned quanta[TC_CBQ_MAXPRIO+1]; + + struct cbq_class link; + + unsigned activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + with backlog */ + +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_class *rx_class; +#endif + struct cbq_class *tx_class; + struct cbq_class *tx_borrowed; + int tx_len; + psched_time_t now; /* Cached timestamp */ + psched_time_t now_rt; /* Cached real time */ + unsigned pmask; + + struct timer_list delay_timer; + struct timer_list wd_timer; /* Watchdog timer, + started when CBQ has + backlog, but cannot + transmit just now */ + long wd_expires; + int toplevel; + u32 hgenerator; +}; + + +#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) + + +static __inline__ unsigned cbq_hash(u32 h) +{ + h ^= h>>8; + h ^= h>>4; + return h&0xF; +} + +static __inline__ struct cbq_class * +cbq_class_lookup(struct cbq_sched_data *q, u32 classid) +{ + struct cbq_class *cl; + + for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) + if (cl->classid == classid) + return cl; + return NULL; +} + +#ifdef CONFIG_NET_CLS_POLICE + +static struct cbq_class * +cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) +{ + struct cbq_class *cl, *new; + + for (cl = this->tparent; cl; cl = cl->tparent) + if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) + return new; + + return NULL; +} + +#endif + +/* Classify packet. The procedure is pretty complicated, but + it allows us to combine link sharing and priority scheduling + transparently. + + Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + so that it resolves to split nodes. Then packets are classified + by logical priority, or a more specific classifier may be attached + to the split node. + */ + +static struct cbq_class * +cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *head = &q->link; + struct cbq_class **defmap; + struct cbq_class *cl = NULL; + u32 prio = skb->priority; + struct tcf_result res; + + /* + * Step 1. If skb->priority points to one of our classes, use it. + */ + if (TC_H_MAJ(prio^sch->handle) == 0 && + (cl = cbq_class_lookup(q, prio)) != NULL) + return cl; + + *qerr = NET_XMIT_DROP; + for (;;) { + int result = 0; + defmap = head->defaults; + + /* + * Step 2+n. Apply classifier. + */ + if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + goto fallback; + + if ((cl = (void*)res.class) == NULL) { + if (TC_H_MAJ(res.classid)) + cl = cbq_class_lookup(q, res.classid); + else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + + if (cl == NULL || cl->level >= head->level) + goto fallback; + } + +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + switch (result) { + case TC_POLICE_RECLASSIFY: + return cbq_reclassify(skb, cl); + case TC_POLICE_SHOT: + return NULL; + default: + break; + } +#endif + if (cl->level == 0) + return cl; + + /* + * Step 3+n. If classifier selected a link sharing class, + * apply agency specific classifier. + * Repeat this procdure until we hit a leaf node. + */ + head = cl; + } + +fallback: + cl = head; + + /* + * Step 4. No success... + */ + if (TC_H_MAJ(prio) == 0 && + !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[TC_PRIO_BESTEFFORT])) + return head; + + return cl; +} + +/* + A packet has just been enqueued on the empty class. + cbq_activate_class adds it to the tail of active class list + of its priority band. + */ + +static __inline__ void cbq_activate_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + int prio = cl->cpriority; + struct cbq_class *cl_tail; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; + } else { + cl->next_alive = cl; + q->activemask |= (1<<prio); + } +} + +/* + Unlink class from active chain. + Note that this same procedure is done directly in cbq_dequeue* + during round-robin procedure. + */ + +static void cbq_deactivate_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + return; + } + } + + cl = cl_prev->next_alive; + return; + } + } while ((cl_prev = cl) != q->active[prio]); +} + +static void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + int toplevel = q->toplevel; + + if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + PSCHED_TADD2(q->now, incr, now); + + do { + if (PSCHED_TLESS(cl->undertime, now)) { + q->toplevel = cl->level; + return; + } + } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + int len = skb->len; + int ret; + struct cbq_class *cl = cbq_classify(skb, sch, &ret); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; +#endif + if (cl == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + +#ifdef CONFIG_NET_CLS_POLICE + cl->q->__parent = sch; +#endif + if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->bstats.packets++; + sch->bstats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return ret; + } + + sch->qstats.drops++; + cbq_mark_toplevel(q, cl); + cl->qstats.drops++; + return ret; +} + +static int +cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + int ret; + + if ((cl = q->tx_class) == NULL) { + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_CN; + } + q->tx_class = NULL; + + cbq_mark_toplevel(q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; + cl->q->__parent = sch; +#endif + if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->qstats.drops++; + cl->qstats.drops++; + return ret; +} + +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + delay += cl->offtime; + + /* + Class goes to sleep, so that it will have no + chance to work avgidle. Let's forgive it 8) + + BTW cbq-2.0 has a crap in this + place, apparently they forgot to shift it by cl->ewma_log. + */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay <= 0) + delay = 1; + PSCHED_TADD2(q->now, delay, cl->undertime); + + cl->xstats.overactions++; + cl->delayed = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + + /* Dirty work! We must schedule wakeups based on + real available rate, rather than leaf rate, + which may be tiny (even zero). + */ + if (q->toplevel == TC_CBQ_MAXLEVEL) { + struct cbq_class *b; + psched_tdiff_t base_delay = q->wd_expires; + + for (b = cl->borrow; b; b = b->borrow) { + delay = PSCHED_TDIFF(b->undertime, q->now); + if (delay < base_delay) { + if (delay <= 0) + delay = 1; + base_delay = delay; + } + } + + q->wd_expires = base_delay; + } +} + +/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when + they go overlimit + */ + +static void cbq_ovl_rclassic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this = cl; + + do { + if (cl->level > q->toplevel) { + cl = NULL; + break; + } + } while ((cl = cl->borrow) != NULL); + + if (cl == NULL) + cl = this; + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ + +static void cbq_ovl_delay(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + unsigned long sched = jiffies; + + delay += cl->offtime; + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (delay > 0) { + sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + cl->penalized = sched; + cl->cpriority = TC_CBQ_MAXPRIO; + q->pmask |= (1<<TC_CBQ_MAXPRIO); + if (del_timer(&q->delay_timer) && + (long)(q->delay_timer.expires - sched) > 0) + q->delay_timer.expires = sched; + add_timer(&q->delay_timer); + cl->delayed = 1; + cl->xstats.overactions++; + return; + } + delay = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; +} + +/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ + +static void cbq_ovl_lowprio(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + cl->penalized = jiffies + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<<cl->cpriority); + cl->xstats.overactions++; + } + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); +} + +static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + unsigned long now = jiffies; + unsigned long sched = now; + + if (cl_prev == NULL) + return now; + + do { + cl = cl_prev->next_alive; + if ((long)(now - cl->penalized) > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } + } + + cl = cl_prev->next_alive; + } else if ((long)(sched - cl->penalized) > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return (long)(sched - now); +} + +static void cbq_undelay(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct cbq_sched_data *q = qdisc_priv(sch); + long delay = 0; + unsigned pmask; + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = ffz(~pmask); + long tmp; + + pmask &= ~(1<<prio); + + tmp = cbq_undelay_prio(q, prio); + if (tmp > 0) { + q->pmask |= 1<<prio; + if (tmp < delay || delay == 0) + delay = tmp; + } + } + + if (delay) { + q->delay_timer.expires = jiffies + delay; + add_timer(&q->delay_timer); + } + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + + +#ifdef CONFIG_NET_CLS_POLICE + +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + int len = skb->len; + struct Qdisc *sch = child->__parent; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = q->rx_class; + + q->rx_class = NULL; + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + + cbq_mark_toplevel(q, cl); + + q->rx_class = cl; + cl->q->__parent = sch; + + if (cl->q->enqueue(skb, cl->q) == 0) { + sch->q.qlen++; + sch->bstats.packets++; + sch->bstats.bytes+=len; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->qstats.drops++; + return 0; + } + + sch->qstats.drops++; + return -1; +} +#endif + +/* + It is mission critical procedure. + + We "regenerate" toplevel cutoff, if transmitting class + has backlog and it is not regulated. It is not part of + original CBQ description, but looks more reasonable. + Probably, it is wrong. This question needs further investigation. +*/ + +static __inline__ void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, + struct cbq_class *borrowed) +{ + if (cl && q->toplevel >= borrowed->level) { + if (cl->q->q.qlen > 1) { + do { + if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { + q->toplevel = borrowed->level; + return; + } + } while ((borrowed=borrowed->borrow) != NULL); + } +#if 0 + /* It is not necessary now. Uncommenting it + will save CPU cycles, but decrease fairness. + */ + q->toplevel = TC_CBQ_MAXLEVEL; +#endif + } +} + +static void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *this = q->tx_class; + struct cbq_class *cl = this; + int len = q->tx_len; + + q->tx_class = NULL; + + for ( ; cl; cl = cl->share) { + long avgidle = cl->avgidle; + long idle; + + cl->bstats.packets++; + cl->bstats.bytes += len; + + /* + (now - last) is total time between packet right edges. + (last_pktlen/rate) is "virtual" busy time, so that + + idle = (now - last) - last_pktlen/rate + */ + + idle = PSCHED_TDIFF(q->now, cl->last); + if ((unsigned long)idle > 128*1024*1024) { + avgidle = cl->maxidle; + } else { + idle -= L2T(cl, len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + where W=2^{-ewma_log}. But cl->avgidle is scaled: + cl->avgidle == true_avgidle/W, + hence: + */ + avgidle += idle - (avgidle>>cl->ewma_log); + } + + if (avgidle <= 0) { + /* Overlimit or at-limit */ + + if (avgidle < cl->minidle) + avgidle = cl->minidle; + + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + will be allowed to send. + It will occur, when: + (1-W)*true_avgidle + W*delay = 0, i.e. + idle = (1/W - 1)*(-true_avgidle) + or + idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + + /* + That is not all. + To maintain the rate allocated to the class, + we add to undertime virtual clock, + necessary to complete transmitted packet. + (len/phys_bandwidth has been already passed + to the moment of cbq_update) + */ + + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + PSCHED_AUDIT_TDIFF(idle); + + PSCHED_TADD2(q->now, idle, cl->undertime); + } else { + /* Underlimit */ + + PSCHED_SET_PASTPERFECT(cl->undertime); + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + + cbq_update_toplevel(q, this, q->tx_borrowed); +} + +static __inline__ struct cbq_class * +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this_cl = cl; + + if (cl->tparent == NULL) + return cl; + + if (PSCHED_IS_PASTPERFECT(cl->undertime) || + !PSCHED_TLESS(q->now, cl->undertime)) { + cl->delayed = 0; + return cl; + } + + do { + /* It is very suspicious place. Now overlimit + action is generated for not bounded classes + only if link is completely congested. + Though it is in agree with ancestor-only paradigm, + it looks very stupid. Particularly, + it means that this chunk of code will either + never be called or result in strong amplification + of burstiness. Dangerous, silly, and, however, + no another solution exists. + */ + if ((cl = cl->borrow) == NULL) { + this_cl->qstats.overlimits++; + this_cl->overlimit(this_cl); + return NULL; + } + if (cl->level > q->toplevel) + return NULL; + } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && + PSCHED_TLESS(q->now, cl->undertime)); + + cl->delayed = 0; + return cl; +} + +static __inline__ struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + struct cbq_class *borrow = cl; + + if (cl->q->q.qlen && + (borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per + this round. Switch to the next one. + */ + deficit = 1; + cl->deficit += cl->quantum; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + It could occur even if cl->q->q.qlen != 0 + f.e. if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= skb->len; + q->tx_class = cl; + q->tx_borrowed = borrow; + if (borrow != cl) { +#ifndef CBQ_XSTATS_BORROWS_BYTES + borrow->xstats.borrows++; + cl->xstats.borrows++; +#else + borrow->xstats.borrows += skb->len; + cl->xstats.borrows += skb->len; +#endif + } + q->tx_len = skb->len; + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + Unlink it from active chain. + */ + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! */ + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + if (cl->q->q.qlen) + cbq_activate_class(cl); + return NULL; + } + + q->active[prio] = cl_tail; + } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; + } + +next_class: + cl_prev = cl; + cl = cl->next_alive; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static __inline__ struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned activemask; + + activemask = q->activemask&0xFF; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<<prio); + skb = cbq_dequeue_prio(sch, prio); + if (skb) + return skb; + } + return NULL; +} + +static struct sk_buff * +cbq_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct cbq_sched_data *q = qdisc_priv(sch); + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + + if (q->tx_class) { + psched_tdiff_t incr2; + /* Time integrator. We calculate EOS time + by adding expected packet transmission time. + If real time is greater, we warp artificial clock, + so that: + + cbq_time = max(real_time, work); + */ + incr2 = L2T(&q->link, q->tx_len); + PSCHED_TADD(q->now, incr2); + cbq_update(q); + if ((incr -= incr2) < 0) + incr = 0; + } + PSCHED_TADD(q->now, incr); + q->now_rt = now; + + for (;;) { + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch); + if (skb) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + /* All the classes are overlimit. + + It is possible, if: + + 1. Scheduler is empty. + 2. Toplevel cutoff inhibited borrowing. + 3. Root class is overlimit. + + Reset 2d and 3d conditions and retry. + + Note, that NS and cbq-2.0 are buggy, peeking + an arbitrary class is appropriate for ancestor-only + sharing, but not for toplevel algorithm. + + Our version is better, but slower, because it requires + two passes, but it is unavoidable with top-level sharing. + */ + + if (q->toplevel == TC_CBQ_MAXLEVEL && + PSCHED_IS_PASTPERFECT(q->link.undertime)) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_SET_PASTPERFECT(q->link.undertime); + } + + /* No packets in scheduler or nobody wants to give them to us :-( + Sigh... start watchdog timer in the last case. */ + + if (sch->q.qlen) { + sch->qstats.overlimits++; + if (q->wd_expires) { + long delay = PSCHED_US2JIFFIE(q->wd_expires); + if (delay <= 0) + delay = 1; + mod_timer(&q->wd_timer, jiffies + delay); + sch->flags |= TCQ_F_THROTTLED; + } + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + if (this == NULL) + return; + + do { + int level = 0; + struct cbq_class *cl; + + if ((cl = this->children) != NULL) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level+1; + } while ((this = this->tparent) != NULL); +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + unsigned h; + + if (q->quanta[prio] == 0) + return; + + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! + */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { + printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); + cl->quantum = cl->qdisc->dev->mtu/2 + 1; + } + } + } +} + +static void cbq_sync_defmap(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *split = cl->split; + unsigned h; + int i; + + if (split == NULL) + return; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + split->defaults[i] = NULL; + } + + for (i=0; i<=TC_PRIO_MAX; i++) { + int level = split->level; + + if (split->defaults[i]) + continue; + + for (h=0; h<16; h++) { + struct cbq_class *c; + + for (c = q->classes[h]; c; c = c->next) { + if (c->split == split && c->level < level && + c->defmap&(1<<i)) { + split->defaults[i] = c; + level = c->level; + } + } + } + } +} + +static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + if ((split = cl->split) == NULL) + return; + splitid = split->classid; + } + + if (split == NULL || split->classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->classid == splitid) + break; + } + + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def&mask; + } else + cl->defmap = (cl->defmap&~mask)|(def&mask); + + cbq_sync_defmap(cl); +} + +static void cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + + for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + cl->next = NULL; + break; + } + } + + if (this->tparent) { + clp=&this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); + + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + BUG_TRAP(this->sibling == this); + } +} + +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + unsigned h = cbq_hash(this->classid); + struct cbq_class *parent = this->tparent; + + this->sibling = this; + this->next = q->classes[h]; + q->classes[h] = this; + + if (parent == NULL) + return; + + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; + } +} + +static unsigned int cbq_drop(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl, *cl_head; + int prio; + unsigned int len; + + for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { + if ((cl_head = q->active[prio]) == NULL) + continue; + + cl = cl_head; + do { + if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { + sch->q.qlen--; + return len; + } + } while ((cl = cl->next_alive) != cl_head); + } + return 0; +} + +static void +cbq_reset(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + int prio; + unsigned h; + + q->activemask = 0; + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + del_timer(&q->wd_timer); + del_timer(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) + q->active[prio] = NULL; + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + PSCHED_SET_PASTPERFECT(cl->undertime); + cl->avgidle = cl->maxidle; + cl->deficit = cl->quantum; + cl->cpriority = cl->priority; + } + } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change&TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + } + if (lss->change&TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change&TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change&TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + cl->maxidle = lss->maxidle; + cl->avgidle = lss->maxidle; + } + if (lss->change&TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority-1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO-1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || + ovl->priority2-1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2-1; + cl->overlimit = cbq_ovl_lowprio; + break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = (ovl->penalty*HZ)/1000; + return 0; +} + +#ifdef CONFIG_NET_CLS_POLICE +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) +{ + cl->police = p->police; + + if (cl->q->handle) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_CBQ_MAX]; + struct tc_ratespec *r; + + if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || + tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) + return -EINVAL; + + q->link.refcnt = 1; + q->link.sibling = &q->link; + q->link.classid = sch->handle; + q->link.qdisc = sch; + if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO-1; + q->link.priority2 = TC_CBQ_MAXPRIO-1; + q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(sch->dev); + q->link.quantum = q->link.allot; + q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + q->link.stats_lock = &sch->dev->queue_lock; + + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = cbq_watchdog; + init_timer(&q->delay_timer); + q->delay_timer.data = (unsigned long)sch; + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + cbq_link_class(&q->link); + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + cbq_addprio(q, &q->link); + return 0; +} + +static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + + RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority+1; + opt.cpriority = cl->cpriority+1; + opt.weight = cl->weight; + RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2+1; + opt.penalty = (cl->penalty*1000)/HZ; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? cl->split->classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NET_CLS_POLICE +static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_POLICE + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, &q->link) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + q->link.xstats.avgidle = q->link.avgidle; + return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->classid; + tcm->tcm_info = cl->q->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->qstats.qlen = cl->q->q.qlen; + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + + if (!PSCHED_IS_PASTPERFECT(cl->undertime)) + cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || +#endif + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl) { + if (new == NULL) { + if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_POLICE + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif + } + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; + } + return -ENOENT; +} + +static struct Qdisc * +cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + return cl ? cl->q : NULL; +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_filters(struct cbq_class *cl) +{ + struct tcf_proto *tp; + + while ((tp = cl->filter_list) != NULL) { + cl->filter_list = tp->next; + tcf_destroy(tp); + } +} + +static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + BUG_TRAP(!cl->filters); + + cbq_destroy_filters(cl); + qdisc_destroy(cl->q); + qdisc_put_rtab(cl->R_tab); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&cl->bstats, &cl->rate_est); +#endif + if (cl != &q->link) + kfree(cl); +} + +static void +cbq_destroy(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + unsigned h; + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = NULL; +#endif + /* + * Filters must be destroyed first because we don't destroy the + * classes from root to leafs which means that filters can still + * be bound to classes which have been destroyed already. --TGR '04 + */ + for (h = 0; h < 16; h++) + for (cl = q->classes[h]; cl; cl = cl->next) + cbq_destroy_filters(cl); + + for (h = 0; h < 16; h++) { + struct cbq_class *next; + + for (cl = q->classes[h]; cl; cl = next) { + next = cl->next; + cbq_destroy_class(sch, cl); + } + } +} + +static void cbq_put(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (--cl->refcnt == 0) { +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_sched_data *q = qdisc_priv(sch); + + spin_lock_bh(&sch->dev->queue_lock); + if (q->rx_class == cl) + q->rx_class = NULL; + spin_unlock_bh(&sch->dev->queue_lock); +#endif + + cbq_destroy_class(sch, cl); + } +} + +static int +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)*arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) + return -EINVAL; + + if (tb[TCA_CBQ_OVL_STRATEGY-1] && + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + return -EINVAL; + + if (tb[TCA_CBQ_FOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) + return -EINVAL; + + if (tb[TCA_CBQ_RATE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) + return -EINVAL; + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) + return -EINVAL; +#endif + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && cl->tparent->classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + if (tb[TCA_CBQ_RATE-1]) { + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + } + + /* Change class parameters */ + sch_tree_lock(sch); + + if (cl->next_alive != NULL) + cbq_deactivate_class(cl); + + if (rtab) { + rtab = xchg(&cl->R_tab, rtab); + qdisc_put_rtab(rtab); + } + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + if (tb[TCA_CBQ_WRROPT-1]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + tb[TCA_CBQ_LSSOPT-1] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle,0x8000); + + for (i=0; i<0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; + } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; + } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + goto failure; + memset(cl, 0, sizeof(*cl)); + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = &noop_qdisc; + cl->classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + cl->stats_lock = &sch->dev->queue_lock; + + sch_tree_lock(sch); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cbq_adjust_levels(parent); + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + if (cl->ewma_log==0) + cl->ewma_log = q->link.ewma_log; + if (cl->maxidle==0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt==0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; +} + +static int cbq_delete(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + sch_tree_lock(sch); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_borrowed == cl) + q->tx_borrowed = q->tx_class; + if (q->tx_class == cl) { + q->tx_class = NULL; + q->tx_borrowed = NULL; + } +#ifdef CONFIG_NET_CLS_POLICE + if (q->rx_class == cl) + q->rx_class = NULL; +#endif + + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); + + cbq_rmprio(q, cl); + sch_tree_unlock(sch); + + if (--cl->refcnt == 0) + cbq_destroy_class(sch, cl); + + return 0; +} + +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)arg; + + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + if (p && p->level <= cl->level) + return 0; + cl->filters++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + unsigned h; + + if (arg->stop) + return; + + for (h = 0; h < 16; h++) { + struct cbq_class *cl; + + for (cl = q->classes[h]; cl; cl = cl->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops cbq_class_ops = { + .graft = cbq_graft, + .leaf = cbq_leaf, + .get = cbq_get, + .put = cbq_put, + .change = cbq_change_class, + .delete = cbq_delete, + .walk = cbq_walk, + .tcf_chain = cbq_find_tcf, + .bind_tcf = cbq_bind_filter, + .unbind_tcf = cbq_unbind_filter, + .dump = cbq_dump_class, + .dump_stats = cbq_dump_class_stats, +}; + +static struct Qdisc_ops cbq_qdisc_ops = { + .next = NULL, + .cl_ops = &cbq_class_ops, + .id = "cbq", + .priv_size = sizeof(struct cbq_sched_data), + .enqueue = cbq_enqueue, + .dequeue = cbq_dequeue, + .requeue = cbq_requeue, + .drop = cbq_drop, + .init = cbq_init, + .reset = cbq_reset, + .destroy = cbq_destroy, + .change = NULL, + .dump = cbq_dump, + .dump_stats = cbq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cbq_module_init(void) +{ + return register_qdisc(&cbq_qdisc_ops); +} +static void __exit cbq_module_exit(void) +{ + unregister_qdisc(&cbq_qdisc_ops); +} +module_init(cbq_module_init) +module_exit(cbq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c new file mode 100644 index 00000000000..8a3db9d95ba --- /dev/null +++ b/net/sched/sch_dsmark.c @@ -0,0 +1,479 @@ +/* net/sched/sch_dsmark.c - Differentiated Services field marker */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> /* for pkt_sched */ +#include <linux/rtnetlink.h> +#include <net/pkt_sched.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <asm/byteorder.h> + + +#if 1 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +#define PRIV(sch) qdisc_priv(sch) + + +/* + * classid class marking + * ------- ----- ------- + * n/a 0 n/a + * x:0 1 use entry [0] + * ... ... ... + * x:y y>0 y+1 use entry [y] + * ... ... ... + * x:indices-1 indices use entry [indices-1] + * ... ... ... + * x:y y+1 use entry [y & (indices-1)] + * ... ... ... + * 0xffff 0x10000 use entry [indices-1] + */ + + +#define NO_DEFAULT_INDEX (1 << 16) + +struct dsmark_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; + __u8 *mask; /* "owns" the array */ + __u8 *value; + __u16 indices; + __u32 default_index; /* index range is 0...0xffff */ + int set_tc_index; +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int dsmark_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, + old); + if (!new) + new = &noop_qdisc; + sch_tree_lock(sch); + *old = xchg(&p->q,new); + if (*old) + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); /* @@@ move up ? */ + return 0; +} + + +static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + return p->q; +} + + +static unsigned long dsmark_get(struct Qdisc *sch,u32 classid) +{ + struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); + + DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); + return TC_H_MIN(classid)+1; +} + + +static unsigned long dsmark_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return dsmark_get(sch,classid); +} + + +static void dsmark_put(struct Qdisc *sch, unsigned long cl) +{ +} + + +static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_DSMARK_MAX]; + + DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," + "arg 0x%lx\n",sch,p,classid,parent,*arg); + if (*arg > p->indices) + return -ENOENT; + if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) + return -EINVAL; + if (tb[TCA_DSMARK_MASK-1]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1])) + return -EINVAL; + p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]); + } + if (tb[TCA_DSMARK_VALUE-1]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1])) + return -EINVAL; + p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]); + } + return 0; +} + + +static int dsmark_delete(struct Qdisc *sch,unsigned long arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + if (!arg || arg > p->indices) + return -EINVAL; + p->mask[arg-1] = 0xff; + p->value[arg-1] = 0; + return 0; +} + + +static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + int i; + + DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); + if (walker->stop) + return; + for (i = 0; i < p->indices; i++) { + if (p->mask[i] == 0xff && !p->value[i]) + continue; + if (walker->count >= walker->skip) { + if (walker->fn(sch, i+1, walker) < 0) { + walker->stop = 1; + break; + } + } + walker->count++; + } +} + + +static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + return &p->filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + if (p->set_tc_index) { + /* FIXME: Safe with non-linear skbs? --RR */ + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + skb->tc_index = ipv4_get_dsfield(skb->nh.iph) + & ~INET_ECN_MASK; + break; + case __constant_htons(ETH_P_IPV6): + skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h) + & ~INET_ECN_MASK; + break; + default: + skb->tc_index = 0; + break; + }; + } + result = TC_POLICE_OK; /* be nice to gcc */ + if (TC_H_MAJ(skb->priority) == sch->handle) { + skb->tc_index = TC_H_MIN(skb->priority); + } else { + result = tc_classify(skb,p->filter_list,&res); + D2PRINTK("result %d class 0x%04x\n",result,res.classid); + switch (result) { +#ifdef CONFIG_NET_CLS_POLICE + case TC_POLICE_SHOT: + kfree_skb(skb); + break; +#if 0 + case TC_POLICE_RECLASSIFY: + /* FIXME: what to do here ??? */ +#endif +#endif + case TC_POLICE_OK: + skb->tc_index = TC_H_MIN(res.classid); + break; + case TC_POLICE_UNSPEC: + /* fall through */ + default: + if (p->default_index != NO_DEFAULT_INDEX) + skb->tc_index = p->default_index; + break; + }; + } + if ( +#ifdef CONFIG_NET_CLS_POLICE + result == TC_POLICE_SHOT || +#endif + + ((ret = p->q->enqueue(skb,p->q)) != 0)) { + sch->qstats.drops++; + return ret; + } + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + sch->q.qlen++; + return ret; +} + + +static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct sk_buff *skb; + int index; + + D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p); + skb = p->q->ops->dequeue(p->q); + if (!skb) + return NULL; + sch->q.qlen--; + index = skb->tc_index & (p->indices-1); + D2PRINTK("index %d->%d\n",skb->tc_index,index); + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + ipv4_change_dsfield(skb->nh.iph, + p->mask[index],p->value[index]); + break; + case __constant_htons(ETH_P_IPV6): + ipv6_change_dsfield(skb->nh.ipv6h, + p->mask[index],p->value[index]); + break; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. + */ + if (p->mask[index] != 0xff || p->value[index]) + printk(KERN_WARNING "dsmark_dequeue: " + "unsupported protocol %d\n", + htons(skb->protocol)); + break; + }; + return skb; +} + + +static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ + int ret; + struct dsmark_qdisc_data *p = PRIV(sch); + + D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + return 0; + } + sch->qstats.drops++; + return ret; +} + + +static unsigned int dsmark_drop(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned int len; + + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); + if (!p->q->ops->drop) + return 0; + if (!(len = p->q->ops->drop(p->q))) + return 0; + sch->q.qlen--; + return len; +} + + +static int dsmark_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct rtattr *tb[TCA_DSMARK_MAX]; + __u16 tmp; + + DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + if (!opt || + rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || + !tb[TCA_DSMARK_INDICES-1] || + RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) + return -EINVAL; + p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); + if (!p->indices) + return -EINVAL; + for (tmp = p->indices; tmp != 1; tmp >>= 1) { + if (tmp & 1) + return -EINVAL; + } + p->default_index = NO_DEFAULT_INDEX; + if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { + if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) + return -EINVAL; + p->default_index = + *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); + } + p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; + p->mask = kmalloc(p->indices*2,GFP_KERNEL); + if (!p->mask) + return -ENOMEM; + p->value = p->mask+p->indices; + memset(p->mask,0xff,p->indices); + memset(p->value,0,p->indices); + if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + p->q = &noop_qdisc; + DPRINTK("dsmark_init: qdisc %p\n",&p->q); + return 0; +} + + +static void dsmark_reset(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); + qdisc_reset(p->q); + sch->q.qlen = 0; +} + + +static void dsmark_destroy(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct tcf_proto *tp; + + DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); + while (p->filter_list) { + tp = p->filter_list; + p->filter_list = tp->next; + tcf_destroy(tp); + } + qdisc_destroy(p->q); + kfree(p->mask); +} + + +static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); + if (!cl || cl > p->indices) + return -EINVAL; + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); + RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); + if (p->default_index != NO_DEFAULT_INDEX) { + __u16 tmp = p->default_index; + + RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp); + } + if (p->set_tc_index) + RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static struct Qdisc_class_ops dsmark_class_ops = { + .graft = dsmark_graft, + .leaf = dsmark_leaf, + .get = dsmark_get, + .put = dsmark_put, + .change = dsmark_change, + .delete = dsmark_delete, + .walk = dsmark_walk, + .tcf_chain = dsmark_find_tcf, + .bind_tcf = dsmark_bind_filter, + .unbind_tcf = dsmark_put, + .dump = dsmark_dump_class, +}; + +static struct Qdisc_ops dsmark_qdisc_ops = { + .next = NULL, + .cl_ops = &dsmark_class_ops, + .id = "dsmark", + .priv_size = sizeof(struct dsmark_qdisc_data), + .enqueue = dsmark_enqueue, + .dequeue = dsmark_dequeue, + .requeue = dsmark_requeue, + .drop = dsmark_drop, + .init = dsmark_init, + .reset = dsmark_reset, + .destroy = dsmark_destroy, + .change = NULL, + .dump = dsmark_dump, + .owner = THIS_MODULE, +}; + +static int __init dsmark_module_init(void) +{ + return register_qdisc(&dsmark_qdisc_ops); +} +static void __exit dsmark_module_exit(void) +{ + unregister_qdisc(&dsmark_qdisc_ops); +} +module_init(dsmark_module_init) +module_exit(dsmark_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c new file mode 100644 index 00000000000..4888305c96d --- /dev/null +++ b/net/sched/sch_fifo.c @@ -0,0 +1,212 @@ +/* + * net/sched/sch_fifo.c The simplest FIFO queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* 1 band FIFO pseudo-"scheduler" */ + +struct fifo_sched_data +{ + unsigned limit; +}; + +static int +bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (sch->qstats.backlog + skb->len <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + sch->qstats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int +bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff * +bfifo_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->q); + if (skb) + sch->qstats.backlog -= skb->len; + return skb; +} + +static unsigned int +fifo_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + kfree_skb(skb); + return len; + } + return 0; +} + +static void +fifo_reset(struct Qdisc* sch) +{ + skb_queue_purge(&sch->q); + sch->qstats.backlog = 0; +} + +static int +pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (sch->q.qlen < q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + sch->qstats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int +pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->qstats.requeues++; + return 0; +} + + +static struct sk_buff * +pfifo_dequeue(struct Qdisc* sch) +{ + return __skb_dequeue(&sch->q); +} + +static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (opt == NULL) { + unsigned int limit = sch->dev->tx_queue_len ? : 1; + + if (sch->ops == &bfifo_qdisc_ops) + q->limit = limit*sch->dev->mtu; + else + q->limit = limit; + } else { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->limit = ctl->limit; + } + return 0; +} + +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_fifo_qopt opt; + + opt.limit = q->limit; + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct Qdisc_ops pfifo_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "pfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = pfifo_enqueue, + .dequeue = pfifo_dequeue, + .requeue = pfifo_requeue, + .drop = fifo_drop, + .init = fifo_init, + .reset = fifo_reset, + .destroy = NULL, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +struct Qdisc_ops bfifo_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "bfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = bfifo_enqueue, + .dequeue = bfifo_dequeue, + .requeue = bfifo_requeue, + .drop = fifo_drop, + .init = fifo_init, + .reset = fifo_reset, + .destroy = NULL, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +EXPORT_SYMBOL(bfifo_qdisc_ops); +EXPORT_SYMBOL(pfifo_qdisc_ops); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c new file mode 100644 index 00000000000..8c01e023f02 --- /dev/null +++ b/net/sched/sch_generic.c @@ -0,0 +1,609 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 + * - Ingress support + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* Main transmission queue. */ + +/* Main qdisc structure lock. + + However, modifications + to data, participating in scheduling must be additionally + protected with dev->queue_lock spinlock. + + The idea is the following: + - enqueue, dequeue are serialized via top level device + spinlock dev->queue_lock. + - tree walking is protected by read_lock_bh(qdisc_tree_lock) + and this lock is used only in process context. + - updates to tree are made under rtnl semaphore or + from softirq context (__qdisc_destroy rcu-callback) + hence this lock needs local bh disabling. + + qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! + */ +DEFINE_RWLOCK(qdisc_tree_lock); + +void qdisc_lock_tree(struct net_device *dev) +{ + write_lock_bh(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); +} + +void qdisc_unlock_tree(struct net_device *dev) +{ + spin_unlock_bh(&dev->queue_lock); + write_unlock_bh(&qdisc_tree_lock); +} + +/* + dev->queue_lock serializes queue accesses for this device + AND dev->qdisc pointer itself. + + dev->xmit_lock serializes accesses to device driver. + + dev->queue_lock and dev->xmit_lock are mutually exclusive, + if one is grabbed, another must be free. + */ + + +/* Kick device. + Note, that this procedure can be called by a watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. + >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called under dev->queue_lock with locally disabled BH. +*/ + +int qdisc_restart(struct net_device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + + /* Dequeue packet */ + if ((skb = q->dequeue(q)) != NULL) { + unsigned nolock = (dev->features & NETIF_F_LLTX); + /* + * When the driver has LLTX set it does its own locking + * in start_xmit. No need to add additional overhead by + * locking again. These checks are worth it because + * even uncongested locks can be quite expensive. + * The driver can do trylock like here too, in case + * of lock congestion it should return -1 and the packet + * will be requeued. + */ + if (!nolock) { + if (!spin_trylock(&dev->xmit_lock)) { + collision: + /* So, someone grabbed the driver. */ + + /* It may be transient configuration error, + when hard_start_xmit() recurses. We detect + it by checking xmit owner and drop the + packet when deadloop is detected. + */ + if (dev->xmit_lock_owner == smp_processor_id()) { + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); + return -1; + } + __get_cpu_var(netdev_rx_stat).cpu_collision++; + goto requeue; + } + /* Remember that the driver is grabbed by us. */ + dev->xmit_lock_owner = smp_processor_id(); + } + + { + /* And release queue */ + spin_unlock(&dev->queue_lock); + + if (!netif_queue_stopped(dev)) { + int ret; + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + ret = dev->hard_start_xmit(skb, dev); + if (ret == NETDEV_TX_OK) { + if (!nolock) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + } + spin_lock(&dev->queue_lock); + return -1; + } + if (ret == NETDEV_TX_LOCKED && nolock) { + spin_lock(&dev->queue_lock); + goto collision; + } + } + + /* NETDEV_TX_BUSY - we need to requeue */ + /* Release the driver */ + if (!nolock) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + } + spin_lock(&dev->queue_lock); + q = dev->qdisc; + } + + /* Device kicked us out :( + This is possible in three cases: + + 0. driver is locked + 1. fastroute is enabled + 2. device cannot determine busy state + before start of transmission (f.e. dialout) + 3. device is buggy (ppp) + */ + +requeue: + q->ops->requeue(skb, q); + netif_schedule(dev); + return 1; + } + return q->q.qlen; +} + +static void dev_watchdog(unsigned long arg) +{ + struct net_device *dev = (struct net_device *)arg; + + spin_lock(&dev->xmit_lock); + if (dev->qdisc != &noop_qdisc) { + if (netif_device_present(dev) && + netif_running(dev) && + netif_carrier_ok(dev)) { + if (netif_queue_stopped(dev) && + (jiffies - dev->trans_start) > dev->watchdog_timeo) { + printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); + dev->tx_timeout(dev); + } + if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + dev_hold(dev); + } + } + spin_unlock(&dev->xmit_lock); + + dev_put(dev); +} + +static void dev_watchdog_init(struct net_device *dev) +{ + init_timer(&dev->watchdog_timer); + dev->watchdog_timer.data = (unsigned long)dev; + dev->watchdog_timer.function = dev_watchdog; +} + +void __netdev_watchdog_up(struct net_device *dev) +{ + if (dev->tx_timeout) { + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + dev_hold(dev); + } +} + +static void dev_watchdog_up(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + __netdev_watchdog_up(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +static void dev_watchdog_down(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + if (del_timer(&dev->watchdog_timer)) + __dev_put(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + under all circumstances. It is difficult to invent anything faster or + cheaper. + */ + +static int +noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb); + return NET_XMIT_CN; +} + +static struct sk_buff * +noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +static int +noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + if (net_ratelimit()) + printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); + kfree_skb(skb); + return NET_XMIT_CN; +} + +struct Qdisc_ops noop_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "noop", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .requeue = noop_requeue, + .owner = THIS_MODULE, +}; + +struct Qdisc noop_qdisc = { + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noop_qdisc_ops, + .list = LIST_HEAD_INIT(noop_qdisc.list), +}; + +static struct Qdisc_ops noqueue_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "noqueue", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .requeue = noop_requeue, + .owner = THIS_MODULE, +}; + +static struct Qdisc noqueue_qdisc = { + .enqueue = NULL, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noqueue_qdisc_ops, + .list = LIST_HEAD_INIT(noqueue_qdisc.list), +}; + + +static const u8 prio2band[TC_PRIO_MAX+1] = + { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; + +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. + */ + +static int +pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list = qdisc_priv(qdisc); + + list += prio2band[skb->priority&TC_PRIO_MAX]; + + if (list->qlen < qdisc->dev->tx_queue_len) { + __skb_queue_tail(list, skb); + qdisc->q.qlen++; + qdisc->bstats.bytes += skb->len; + qdisc->bstats.packets++; + return 0; + } + qdisc->qstats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static struct sk_buff * +pfifo_fast_dequeue(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + struct sk_buff *skb; + + for (prio = 0; prio < 3; prio++, list++) { + skb = __skb_dequeue(list); + if (skb) { + qdisc->q.qlen--; + return skb; + } + } + return NULL; +} + +static int +pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list = qdisc_priv(qdisc); + + list += prio2band[skb->priority&TC_PRIO_MAX]; + + __skb_queue_head(list, skb); + qdisc->q.qlen++; + qdisc->qstats.requeues++; + return 0; +} + +static void +pfifo_fast_reset(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio=0; prio < 3; prio++) + skb_queue_purge(list+prio); + qdisc->q.qlen = 0; +} + +static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) +{ + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = 3; + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) +{ + int i; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (i=0; i<3; i++) + skb_queue_head_init(list+i); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "pfifo_fast", + .priv_size = 3 * sizeof(struct sk_buff_head), + .enqueue = pfifo_fast_enqueue, + .dequeue = pfifo_fast_dequeue, + .requeue = pfifo_fast_requeue, + .init = pfifo_fast_init, + .reset = pfifo_fast_reset, + .dump = pfifo_fast_dump, + .owner = THIS_MODULE, +}; + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + void *p; + struct Qdisc *sch; + int size; + + /* ensure that the Qdisc and the private data are 32-byte aligned */ + size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); + size += ops->priv_size + QDISC_ALIGN_CONST; + + p = kmalloc(size, GFP_KERNEL); + if (!p) + return NULL; + memset(p, 0, size); + + sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) + & ~QDISC_ALIGN_CONST); + sch->padded = (char *)sch - (char *)p; + + INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + dev_hold(dev); + sch->stats_lock = &dev->queue_lock; + atomic_set(&sch->refcnt, 1); + if (!ops->init || ops->init(sch, NULL) == 0) + return sch; + + dev_put(dev); + kfree(p); + return NULL; +} + +/* Under dev->queue_lock and BH! */ + +void qdisc_reset(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + + if (ops->reset) + ops->reset(qdisc); +} + +/* this is the rcu callback function to clean up a qdisc when there + * are no further references to it */ + +static void __qdisc_destroy(struct rcu_head *head) +{ + struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); + struct Qdisc_ops *ops = qdisc->ops; + +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); +#endif + write_lock(&qdisc_tree_lock); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + write_unlock(&qdisc_tree_lock); + module_put(ops->owner); + + dev_put(qdisc->dev); + kfree((char *) qdisc - qdisc->padded); +} + +/* Under dev->queue_lock and BH! */ + +void qdisc_destroy(struct Qdisc *qdisc) +{ + struct list_head cql = LIST_HEAD_INIT(cql); + struct Qdisc *cq, *q, *n; + + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + + if (!list_empty(&qdisc->list)) { + if (qdisc->ops->cl_ops == NULL) + list_del(&qdisc->list); + else + list_move(&qdisc->list, &cql); + } + + /* unlink inner qdiscs from dev->qdisc_list immediately */ + list_for_each_entry(cq, &cql, list) + list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) + if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { + if (q->ops->cl_ops == NULL) + list_del_init(&q->list); + else + list_move_tail(&q->list, &cql); + } + list_for_each_entry_safe(cq, n, &cql, list) + list_del_init(&cq->list); + + call_rcu(&qdisc->q_rcu, __qdisc_destroy); +} + +void dev_activate(struct net_device *dev) +{ + /* No queueing discipline is attached to device; + create default one i.e. pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual interfaces + */ + + if (dev->qdisc_sleeping == &noop_qdisc) { + struct Qdisc *qdisc; + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); + if (qdisc == NULL) { + printk(KERN_INFO "%s: activation failed\n", dev->name); + return; + } + write_lock_bh(&qdisc_tree_lock); + list_add_tail(&qdisc->list, &dev->qdisc_list); + write_unlock_bh(&qdisc_tree_lock); + } else { + qdisc = &noqueue_qdisc; + } + write_lock_bh(&qdisc_tree_lock); + dev->qdisc_sleeping = qdisc; + write_unlock_bh(&qdisc_tree_lock); + } + + spin_lock_bh(&dev->queue_lock); + rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping); + if (dev->qdisc != &noqueue_qdisc) { + dev->trans_start = jiffies; + dev_watchdog_up(dev); + } + spin_unlock_bh(&dev->queue_lock); +} + +void dev_deactivate(struct net_device *dev) +{ + struct Qdisc *qdisc; + + spin_lock_bh(&dev->queue_lock); + qdisc = dev->qdisc; + dev->qdisc = &noop_qdisc; + + qdisc_reset(qdisc); + + spin_unlock_bh(&dev->queue_lock); + + dev_watchdog_down(dev); + + while (test_bit(__LINK_STATE_SCHED, &dev->state)) + yield(); + + spin_unlock_wait(&dev->xmit_lock); +} + +void dev_init_scheduler(struct net_device *dev) +{ + qdisc_lock_tree(dev); + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; + INIT_LIST_HEAD(&dev->qdisc_list); + qdisc_unlock_tree(dev); + + dev_watchdog_init(dev); +} + +void dev_shutdown(struct net_device *dev) +{ + struct Qdisc *qdisc; + + qdisc_lock_tree(dev); + qdisc = dev->qdisc_sleeping; + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; + qdisc_destroy(qdisc); +#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) + if ((qdisc = dev->qdisc_ingress) != NULL) { + dev->qdisc_ingress = NULL; + qdisc_destroy(qdisc); + } +#endif + BUG_TRAP(!timer_pending(&dev->watchdog_timer)); + qdisc_unlock_tree(dev); +} + +EXPORT_SYMBOL(__netdev_watchdog_up); +EXPORT_SYMBOL(noop_qdisc); +EXPORT_SYMBOL(noop_qdisc_ops); +EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(qdisc_destroy); +EXPORT_SYMBOL(qdisc_reset); +EXPORT_SYMBOL(qdisc_restart); +EXPORT_SYMBOL(qdisc_lock_tree); +EXPORT_SYMBOL(qdisc_unlock_tree); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c new file mode 100644 index 00000000000..25c171c3271 --- /dev/null +++ b/net/sched/sch_gred.c @@ -0,0 +1,630 @@ +/* + * net/sched/sch_gred.c Generic Random Early Detection queue. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 + * + * 991129: - Bug fix with grio mode + * - a better sing. AvgQ mode with Grio(WRED) + * - A finer grained VQ dequeue based on sugestion + * from Ren Liu + * - More error checks + * + * + * + * For all the glorious comments look at Alexey's sch_red.c + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#if 1 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + +struct gred_sched_data; +struct gred_sched; + +struct gred_sched_data +{ +/* Parameters */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 DP; /* the drop pramaters */ + char Wlog; /* log(W) */ + char Plog; /* random number bits */ + u32 Scell_max; + u32 Rmask; + u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 packetsin; /* packets seen on virtualQ so far*/ + u32 backlog; /* bytes on the virtualQ */ + u32 forced; /* packets dropped for exceeding limits */ + u32 early; /* packets dropped as a warning */ + u32 other; /* packets dropped by invoking drop() */ + u32 pdrop; /* packets dropped because we exceeded physical queue limits */ + char Scell_log; + u8 Stab[256]; + u8 prio; /* the prio of this vq */ + +/* Variables */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + u32 qR; /* Cached random number */ + + psched_time_t qidlestart; /* Start of idle period */ +}; + +struct gred_sched +{ + struct gred_sched_data *tab[MAX_DPs]; + u32 DPs; + u32 def; + u8 initd; + u8 grio; + u8 eqp; +}; + +static int +gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + psched_time_t now; + struct gred_sched_data *q=NULL; + struct gred_sched *t= qdisc_priv(sch); + unsigned long qave=0; + int i=0; + + if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { + D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); + goto do_enqueue; + } + + + if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { + printk("GRED: setting to default (%d)\n ",t->def); + if (!(q=t->tab[t->def])) { + DPRINTK("GRED: setting to default FAILED! dropping!! " + "(%d)\n ", t->def); + goto drop; + } + /* fix tc_index? --could be controvesial but needed for + requeueing */ + skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; + } + + D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " + "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, + sch->qstats.backlog); + /* sum up all the qaves of prios <= to ours to get the new qave*/ + if (!t->eqp && t->grio) { + for (i=0;i<t->DPs;i++) { + if ((!t->tab[i]) || (i==q->DP)) + continue; + + if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) + qave +=t->tab[i]->qave; + } + + } + + q->packetsin++; + q->bytesin+=skb->len; + + if (t->eqp && t->grio) { + qave=0; + q->qave=t->tab[t->def]->qave; + q->qidlestart=t->tab[t->def]->qidlestart; + } + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); + PSCHED_SET_PASTPERFECT(q->qidlestart); + + q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; + } else { + if (t->eqp) { + q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); + } else { + q->qave += q->backlog - (q->qave >> q->Wlog); + } + + } + + + if (t->eqp && t->grio) + t->tab[t->def]->qave=q->qave; + + if ((q->qave+qave) < q->qth_min) { + q->qcount = -1; +enqueue: + if (q->backlog + skb->len <= q->limit) { + q->backlog += skb->len; +do_enqueue: + __skb_queue_tail(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } else { + q->pdrop++; + } + +drop: + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; + } + if ((q->qave+qave) >= q->qth_max) { + q->qcount = -1; + sch->qstats.overlimits++; + q->forced++; + goto drop; + } + if (++q->qcount) { + if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = net_random()&q->Rmask; + sch->qstats.overlimits++; + q->early++; + goto drop; + } + q->qR = net_random()&q->Rmask; + goto enqueue; +} + +static int +gred_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + q= t->tab[(skb->tc_index&0xf)]; +/* error checking here -- probably unnecessary */ + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + q->backlog += skb->len; + return 0; +} + +static struct sk_buff * +gred_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->qstats.backlog -= skb->len; + q= t->tab[(skb->tc_index&0xf)]; + if (q) { + q->backlog -= skb->len; + if (!q->backlog && !t->eqp) + PSCHED_GET_TIME(q->qidlestart); + } else { + D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + } + return skb; + } + + if (t->eqp) { + q= t->tab[t->def]; + if (!q) + D2PRINTK("no default VQ set: Results will be " + "screwed up\n"); + else + PSCHED_GET_TIME(q->qidlestart); + } + + return NULL; +} + +static unsigned int gred_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + sch->qstats.drops++; + q= t->tab[(skb->tc_index&0xf)]; + if (q) { + q->backlog -= len; + q->other++; + if (!q->backlog && !t->eqp) + PSCHED_GET_TIME(q->qidlestart); + } else { + D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + } + + kfree_skb(skb); + return len; + } + + q=t->tab[t->def]; + if (!q) { + D2PRINTK("no default VQ set: Results might be screwed up\n"); + return 0; + } + + PSCHED_GET_TIME(q->qidlestart); + return 0; + +} + +static void gred_reset(struct Qdisc* sch) +{ + int i; + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + + __skb_queue_purge(&sch->q); + + sch->qstats.backlog = 0; + + for (i=0;i<t->DPs;i++) { + q= t->tab[i]; + if (!q) + continue; + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; + q->backlog = 0; + q->other=0; + q->forced=0; + q->pdrop=0; + q->early=0; + } +} + +static int gred_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct gred_sched *table = qdisc_priv(sch); + struct gred_sched_data *q; + struct tc_gred_qopt *ctl; + struct tc_gred_sopt *sopt; + struct rtattr *tb[TCA_GRED_STAB]; + struct rtattr *tb2[TCA_GRED_DPS]; + int i; + + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + return -EINVAL; + + if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { + rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + + if (tb2[TCA_GRED_DPS-1] == 0) + return -EINVAL; + + sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); + table->DPs=sopt->DPs; + table->def=sopt->def_DP; + table->grio=sopt->grio; + table->initd=0; + /* probably need to clear all the table DP entries as well */ + return 0; + } + + + if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); + if (ctl->DP > MAX_DPs-1 ) { + /* misbehaving is punished! Put in the default drop probability */ + DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP " + "set to default at %d\n",ctl->DP,table->def); + ctl->DP=table->def; + } + + if (table->tab[ctl->DP] == NULL) { + table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), + GFP_KERNEL); + if (NULL == table->tab[ctl->DP]) + return -ENOMEM; + memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); + } + q= table->tab[ctl->DP]; + + if (table->grio) { + if (ctl->prio <=0) { + if (table->def && table->tab[table->def]) { + DPRINTK("\nGRED: DP %u does not have a prio" + "setting default to %d\n",ctl->DP, + table->tab[table->def]->prio); + q->prio=table->tab[table->def]->prio; + } else { + DPRINTK("\nGRED: DP %u does not have a prio" + " setting default to 8\n",ctl->DP); + q->prio=8; + } + } else { + q->prio=ctl->prio; + } + } else { + q->prio=8; + } + + + q->DP=ctl->DP; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->limit = ctl->limit; + q->Scell_log = ctl->Scell_log; + q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + q->qave=0; + q->backlog=0; + q->qcount = -1; + q->other=0; + q->forced=0; + q->pdrop=0; + q->early=0; + + PSCHED_SET_PASTPERFECT(q->qidlestart); + memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); + + if ( table->initd && table->grio) { + /* this looks ugly but it's not in the fast path */ + for (i=0;i<table->DPs;i++) { + if ((!table->tab[i]) || (i==q->DP) ) + continue; + if (table->tab[i]->prio == q->prio ){ + /* WRED mode detected */ + table->eqp=1; + break; + } + } + } + + if (!table->initd) { + table->initd=1; + /* + the first entry also goes into the default until + over-written + */ + + if (table->tab[table->def] == NULL) { + table->tab[table->def]= + kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); + if (NULL == table->tab[table->def]) + return -ENOMEM; + + memset(table->tab[table->def], 0, + (sizeof(struct gred_sched_data))); + } + q= table->tab[table->def]; + q->DP=table->def; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->limit = ctl->limit; + q->Scell_log = ctl->Scell_log; + q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + + if (table->grio) + q->prio=table->tab[ctl->DP]->prio; + else + q->prio=8; + + q->qcount = -1; + PSCHED_SET_PASTPERFECT(q->qidlestart); + memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); + } + return 0; + +} + +static int gred_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_sopt *sopt; + struct rtattr *tb[TCA_GRED_STAB]; + struct rtattr *tb2[TCA_GRED_DPS]; + + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + return -EINVAL; + + if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { + rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + + if (tb2[TCA_GRED_DPS-1] == 0) + return -EINVAL; + + sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); + table->DPs=sopt->DPs; + table->def=sopt->def_DP; + table->grio=sopt->grio; + table->initd=0; + return 0; + } + + DPRINTK("\n GRED_INIT error!\n"); + return -EINVAL; +} + +static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + unsigned long qave; + struct rtattr *rta; + struct tc_gred_qopt *opt = NULL ; + struct tc_gred_qopt *dst; + struct gred_sched *table = qdisc_priv(sch); + struct gred_sched_data *q; + int i; + unsigned char *b = skb->tail; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); + + if (opt == NULL) { + DPRINTK("gred_dump:failed to malloc for %Zd\n", + sizeof(struct tc_gred_qopt)*MAX_DPs); + goto rtattr_failure; + } + + memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); + + if (!table->initd) { + DPRINTK("NO GRED Queues setup!\n"); + } + + for (i=0;i<MAX_DPs;i++) { + dst= &opt[i]; + q= table->tab[i]; + + if (!q) { + /* hack -- fix at some point with proper message + This is how we indicate to tc that there is no VQ + at this DP */ + + dst->DP=MAX_DPs+i; + continue; + } + + dst->limit=q->limit; + dst->qth_min=q->qth_min>>q->Wlog; + dst->qth_max=q->qth_max>>q->Wlog; + dst->DP=q->DP; + dst->backlog=q->backlog; + if (q->qave) { + if (table->eqp && table->grio) { + q->qidlestart=table->tab[table->def]->qidlestart; + q->qave=table->tab[table->def]->qave; + } + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long idle; + psched_time_t now; + PSCHED_GET_TIME(now); + idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); + qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; + dst->qave = qave >> q->Wlog; + + } else { + dst->qave = q->qave >> q->Wlog; + } + } else { + dst->qave = 0; + } + + + dst->Wlog = q->Wlog; + dst->Plog = q->Plog; + dst->Scell_log = q->Scell_log; + dst->other = q->other; + dst->forced = q->forced; + dst->early = q->early; + dst->pdrop = q->pdrop; + dst->prio = q->prio; + dst->packets=q->packetsin; + dst->bytesin=q->bytesin; + } + + RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); + rta->rta_len = skb->tail - b; + + kfree(opt); + return skb->len; + +rtattr_failure: + if (opt) + kfree(opt); + DPRINTK("gred_dump: FAILURE!!!!\n"); + +/* also free the opt struct here */ + skb_trim(skb, b - skb->data); + return -1; +} + +static void gred_destroy(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + int i; + + for (i = 0;i < table->DPs; i++) { + if (table->tab[i]) + kfree(table->tab[i]); + } +} + +static struct Qdisc_ops gred_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "gred", + .priv_size = sizeof(struct gred_sched), + .enqueue = gred_enqueue, + .dequeue = gred_dequeue, + .requeue = gred_requeue, + .drop = gred_drop, + .init = gred_init, + .reset = gred_reset, + .destroy = gred_destroy, + .change = gred_change, + .dump = gred_dump, + .owner = THIS_MODULE, +}; + +static int __init gred_module_init(void) +{ + return register_qdisc(&gred_qdisc_ops); +} +static void __exit gred_module_exit(void) +{ + unregister_qdisc(&gred_qdisc_ops); +} +module_init(gred_module_init) +module_exit(gred_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c new file mode 100644 index 00000000000..c26764bc410 --- /dev/null +++ b/net/sched/sch_hfsc.c @@ -0,0 +1,1822 @@ +/* + * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * 2003-10-17 - Ported from altq + */ +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. + */ + +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/compiler.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/init.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/pkt_sched.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> +#include <asm/system.h> +#include <asm/div64.h> + +#define HFSC_DEBUG 1 + +/* + * kernel internal service curve representation: + * coordinates are given by 64 bit unsigned integers. + * x-axis: unit is clock count. + * y-axis: unit is byte. + * + * The service curve parameters are converted to the internal + * representation. The slope values are scaled to avoid overflow. + * the inverse slope values as well as the y-projection of the 1st + * segment are kept in order to to avoid 64-bit divide operations + * that are expensive on 32-bit architectures. + */ + +struct internal_sc +{ + u64 sm1; /* scaled slope of the 1st segment */ + u64 ism1; /* scaled inverse-slope of the 1st segment */ + u64 dx; /* the x-projection of the 1st segment */ + u64 dy; /* the y-projection of the 1st segment */ + u64 sm2; /* scaled slope of the 2nd segment */ + u64 ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +/* runtime service curve */ +struct runtime_sc +{ + u64 x; /* current starting position on x-axis */ + u64 y; /* current starting position on y-axis */ + u64 sm1; /* scaled slope of the 1st segment */ + u64 ism1; /* scaled inverse-slope of the 1st segment */ + u64 dx; /* the x-projection of the 1st segment */ + u64 dy; /* the y-projection of the 1st segment */ + u64 sm2; /* scaled slope of the 2nd segment */ + u64 ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +enum hfsc_class_flags +{ + HFSC_RSC = 0x1, + HFSC_FSC = 0x2, + HFSC_USC = 0x4 +}; + +struct hfsc_class +{ + u32 classid; /* class id */ + unsigned int refcnt; /* usage count */ + + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + spinlock_t *stats_lock; + unsigned int level; /* class level in hierarchy */ + struct tcf_proto *filter_list; /* filter list */ + unsigned int filter_cnt; /* filter count */ + + struct hfsc_sched *sched; /* scheduler data */ + struct hfsc_class *cl_parent; /* parent class */ + struct list_head siblings; /* sibling classes */ + struct list_head children; /* child classes */ + struct Qdisc *qdisc; /* leaf qdisc */ + + struct rb_node el_node; /* qdisc's eligible tree member */ + struct rb_root vt_tree; /* active children sorted by cl_vt */ + struct rb_node vt_node; /* parent's vt_tree member */ + struct rb_root cf_tree; /* active children sorted by cl_f */ + struct rb_node cf_node; /* parent's cf_heap member */ + struct list_head hlist; /* hash list member */ + struct list_head dlist; /* drop list member */ + + u64 cl_total; /* total work in bytes */ + u64 cl_cumul; /* cumulative work in bytes done by + real-time criteria */ + + u64 cl_d; /* deadline*/ + u64 cl_e; /* eligible time */ + u64 cl_vt; /* virtual time */ + u64 cl_f; /* time when this class will fit for + link-sharing, max(myf, cfmin) */ + u64 cl_myf; /* my fit-time (calculated from this + class's own upperlimit curve) */ + u64 cl_myfadj; /* my fit-time adjustment (to cancel + history dependence) */ + u64 cl_cfmin; /* earliest children's fit-time (used + with cl_myf to obtain cl_f) */ + u64 cl_cvtmin; /* minimal virtual time among the + children fit for link-sharing + (monotonic within a period) */ + u64 cl_vtadj; /* intra-period cumulative vt + adjustment */ + u64 cl_vtoff; /* inter-period cumulative vt offset */ + u64 cl_cvtmax; /* max child's vt in the last period */ + u64 cl_cvtoff; /* cumulative cvtmax of all periods */ + u64 cl_pcvtoff; /* parent's cvtoff at initalization + time */ + + struct internal_sc cl_rsc; /* internal real-time service curve */ + struct internal_sc cl_fsc; /* internal fair service curve */ + struct internal_sc cl_usc; /* internal upperlimit service curve */ + struct runtime_sc cl_deadline; /* deadline curve */ + struct runtime_sc cl_eligible; /* eligible curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + unsigned long cl_flags; /* which curves are valid */ + unsigned long cl_vtperiod; /* vt period sequence number */ + unsigned long cl_parentperiod;/* parent's vt period sequence number*/ + unsigned long cl_nactive; /* number of active children */ +}; + +#define HFSC_HSIZE 16 + +struct hfsc_sched +{ + u16 defcls; /* default class id */ + struct hfsc_class root; /* root class */ + struct list_head clhash[HFSC_HSIZE]; /* class hash */ + struct rb_root eligible; /* eligible tree */ + struct list_head droplist; /* active leaf class list (for + dropping) */ + struct sk_buff_head requeue; /* requeued packet */ + struct timer_list wd_timer; /* watchdog timer */ +}; + +/* + * macros + */ +#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY +#include <linux/time.h> +#undef PSCHED_GET_TIME +#define PSCHED_GET_TIME(stamp) \ +do { \ + struct timeval tv; \ + do_gettimeofday(&tv); \ + (stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \ +} while (0) +#endif + +#if HFSC_DEBUG +#define ASSERT(cond) \ +do { \ + if (unlikely(!(cond))) \ + printk("assertion %s failed at %s:%i (%s)\n", \ + #cond, __FILE__, __LINE__, __FUNCTION__); \ +} while (0) +#else +#define ASSERT(cond) +#endif /* HFSC_DEBUG */ + +#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ + + +/* + * eligible tree holds backlogged classes being sorted by their eligible times. + * there is one eligible tree per hfsc instance. + */ + +static void +eltree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->sched->eligible.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, el_node); + if (cl->cl_e >= cl1->cl_e) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->el_node, parent, p); + rb_insert_color(&cl->el_node, &cl->sched->eligible); +} + +static inline void +eltree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->el_node, &cl->sched->eligible); +} + +static inline void +eltree_update(struct hfsc_class *cl) +{ + eltree_remove(cl); + eltree_insert(cl); +} + +/* find the class with the minimum deadline among the eligible classes */ +static inline struct hfsc_class * +eltree_get_mindl(struct hfsc_sched *q, u64 cur_time) +{ + struct hfsc_class *p, *cl = NULL; + struct rb_node *n; + + for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, el_node); + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return cl; +} + +/* find the class with minimum eligible time among the eligible classes */ +static inline struct hfsc_class * +eltree_get_minel(struct hfsc_sched *q) +{ + struct rb_node *n; + + n = rb_first(&q->eligible); + if (n == NULL) + return NULL; + return rb_entry(n, struct hfsc_class, el_node); +} + +/* + * vttree holds holds backlogged child classes being sorted by their virtual + * time. each intermediate class has one vttree. + */ +static void +vttree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->vt_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, vt_node); + if (cl->cl_vt >= cl1->cl_vt) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->vt_node, parent, p); + rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree); +} + +static inline void +vttree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree); +} + +static inline void +vttree_update(struct hfsc_class *cl) +{ + vttree_remove(cl); + vttree_insert(cl); +} + +static inline struct hfsc_class * +vttree_firstfit(struct hfsc_class *cl, u64 cur_time) +{ + struct hfsc_class *p; + struct rb_node *n; + + for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, vt_node); + if (p->cl_f <= cur_time) + return p; + } + return NULL; +} + +/* + * get the leaf class with the minimum vt in the hierarchy + */ +static struct hfsc_class * +vttree_get_minvt(struct hfsc_class *cl, u64 cur_time) +{ + /* if root-class's cfmin is bigger than cur_time nothing to do */ + if (cl->cl_cfmin > cur_time) + return NULL; + + while (cl->level > 0) { + cl = vttree_firstfit(cl, cur_time); + if (cl == NULL) + return NULL; + /* + * update parent's cl_cvtmin. + */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; + } + return cl; +} + +static void +cftree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->cf_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, cf_node); + if (cl->cl_f >= cl1->cl_f) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->cf_node, parent, p); + rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_update(struct hfsc_class *cl) +{ + cftree_remove(cl); + cftree_insert(cl); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bps + * d: us + * internal service curve parameters + * sm: (bytes/psched_us) << SM_SHIFT + * ism: (psched_us/byte) << ISM_SHIFT + * dx: psched_us + * + * Clock source resolution (CONFIG_NET_SCH_CLK_*) + * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. + * CPU: resolution is between 0.5us and 1us. + * GETTIMEOFDAY: resolution is exactly 1us. + * + * sm and ism are scaled in order to keep effective digits. + * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective + * digits in decimal using the following table. + * + * Note: We can afford the additional accuracy (altq hfsc keeps at most + * 3 effective digits) thanks to the fact that linux clock is bounded + * much more tightly. + * + * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps + * ------------+------------------------------------------------------- + * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-e 62500e-3 + * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 + * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 + * + * 0.5us/byte 160 16 1.6 0.16 0.016 + * us/byte 80 8 0.8 0.08 0.008 + * 1.27us/byte 63 6.3 0.63 0.063 0.0063 + */ +#define SM_SHIFT 20 +#define ISM_SHIFT 18 + +#define SM_MASK ((1ULL << SM_SHIFT) - 1) +#define ISM_MASK ((1ULL << ISM_SHIFT) - 1) + +static inline u64 +seg_x2y(u64 x, u64 sm) +{ + u64 y; + + /* + * compute + * y = x * sm >> SM_SHIFT + * but divide it for the upper and lower bits to avoid overflow + */ + y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); + return y; +} + +static inline u64 +seg_y2x(u64 y, u64 ism) +{ + u64 x; + + if (y == 0) + x = 0; + else if (ism == HT_INFINITY) + x = HT_INFINITY; + else { + x = (y >> ISM_SHIFT) * ism + + (((y & ISM_MASK) * ism) >> ISM_SHIFT); + } + return x; +} + +/* Convert m (bps) into sm (bytes/psched us) */ +static u64 +m2sm(u32 m) +{ + u64 sm; + + sm = ((u64)m << SM_SHIFT); + sm += PSCHED_JIFFIE2US(HZ) - 1; + do_div(sm, PSCHED_JIFFIE2US(HZ)); + return sm; +} + +/* convert m (bps) into ism (psched us/byte) */ +static u64 +m2ism(u32 m) +{ + u64 ism; + + if (m == 0) + ism = HT_INFINITY; + else { + ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); + ism += m - 1; + do_div(ism, m); + } + return ism; +} + +/* convert d (us) into dx (psched us) */ +static u64 +d2dx(u32 d) +{ + u64 dx; + + dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); + dx += 1000000 - 1; + do_div(dx, 1000000); + return dx; +} + +/* convert sm (bytes/psched us) into m (bps) */ +static u32 +sm2m(u64 sm) +{ + u64 m; + + m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; + return (u32)m; +} + +/* convert dx (psched us) into d (us) */ +static u32 +dx2d(u64 dx) +{ + u64 d; + + d = dx * 1000000; + do_div(d, PSCHED_JIFFIE2US(HZ)); + return (u32)d; +} + +static void +sc2isc(struct tc_service_curve *sc, struct internal_sc *isc) +{ + isc->sm1 = m2sm(sc->m1); + isc->ism1 = m2ism(sc->m1); + isc->dx = d2dx(sc->d); + isc->dy = seg_x2y(isc->dx, isc->sm1); + isc->sm2 = m2sm(sc->m2); + isc->ism2 = m2ism(sc->m2); +} + +/* + * initialize the runtime service curve with the given internal + * service curve starting at (x, y). + */ +static void +rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) +{ + rtsc->x = x; + rtsc->y = y; + rtsc->sm1 = isc->sm1; + rtsc->ism1 = isc->ism1; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + rtsc->sm2 = isc->sm2; + rtsc->ism2 = isc->ism2; +} + +/* + * calculate the y-projection of the runtime service curve by the + * given x-projection value + */ +static u64 +rtsc_y2x(struct runtime_sc *rtsc, u64 y) +{ + u64 x; + + if (y < rtsc->y) + x = rtsc->x; + else if (y <= rtsc->y + rtsc->dy) { + /* x belongs to the 1st segment */ + if (rtsc->dy == 0) + x = rtsc->x + rtsc->dx; + else + x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); + } else { + /* x belongs to the 2nd segment */ + x = rtsc->x + rtsc->dx + + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); + } + return x; +} + +static u64 +rtsc_x2y(struct runtime_sc *rtsc, u64 x) +{ + u64 y; + + if (x <= rtsc->x) + y = rtsc->y; + else if (x <= rtsc->x + rtsc->dx) + /* y belongs to the 1st segment */ + y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); + else + /* y belongs to the 2nd segment */ + y = rtsc->y + rtsc->dy + + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); + return y; +} + +/* + * update the runtime service curve by taking the minimum of the current + * runtime service curve and the service curve starting at (x, y). + */ +static void +rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) +{ + u64 y1, y2, dx, dy; + u32 dsm; + + if (isc->sm1 <= isc->sm2) { + /* service curve is convex */ + y1 = rtsc_x2y(rtsc, x); + if (y1 < y) + /* the current rtsc is smaller */ + return; + rtsc->x = x; + rtsc->y = y; + return; + } + + /* + * service curve is concave + * compute the two y values of the current rtsc + * y1: at x + * y2: at (x + dx) + */ + y1 = rtsc_x2y(rtsc, x); + if (y1 <= y) { + /* rtsc is below isc, no change to rtsc */ + return; + } + + y2 = rtsc_x2y(rtsc, x + isc->dx); + if (y2 >= y + isc->dy) { + /* rtsc is above isc, replace rtsc by isc */ + rtsc->x = x; + rtsc->y = y; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + return; + } + + /* + * the two curves intersect + * compute the offsets (dx, dy) using the reverse + * function of seg_x2y() + * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) + */ + dx = (y1 - y) << SM_SHIFT; + dsm = isc->sm1 - isc->sm2; + do_div(dx, dsm); + /* + * check if (x, y1) belongs to the 1st segment of rtsc. + * if so, add the offset. + */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; + return; +} + +static void +init_ed(struct hfsc_class *cl, unsigned int next_len) +{ + u64 cur_time; + + PSCHED_GET_TIME(cur_time); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. + */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + eltree_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, unsigned int next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + eltree_update(cl); +} + +static inline void +update_d(struct hfsc_class *cl, unsigned int next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static inline void +update_cfmin(struct hfsc_class *cl) +{ + struct rb_node *n = rb_first(&cl->cf_tree); + struct hfsc_class *p; + + if (n == NULL) { + cl->cl_cfmin = 0; + return; + } + p = rb_entry(n, struct hfsc_class, cf_node); + cl->cl_cfmin = p->cl_f; +} + +static void +init_vf(struct hfsc_class *cl, unsigned int len) +{ + struct hfsc_class *max_cl; + struct rb_node *n; + u64 vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + n = rb_last(&cl->cl_parent->vt_tree); + if (n != NULL) { + max_cl = rb_entry(n, struct hfsc_class,vt_node); + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to cvtoff to make a new + * vt (vtoff + vt) larger than the vt in the + * last period for all children. + */ + vt = cl->cl_parent->cl_cvtmax; + cl->cl_parent->cl_cvtoff += vt; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + cl->cl_vt = 0; + } + + cl->cl_vtoff = cl->cl_parent->cl_cvtoff - + cl->cl_pcvtoff; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, + cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + vttree_insert(cl); + cftree_insert(cl); + + if (cl->cl_flags & HFSC_USC) { + /* class has upper limit curve */ + if (cur_time == 0) + PSCHED_GET_TIME(cur_time); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + f = max(cl->cl_myf, cl->cl_cfmin); + if (f != cl->cl_f) { + cl->cl_f = f; + cftree_update(cl); + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) +{ + u64 f; /* , myf_bound, delta; */ + int go_passive = 0; + + if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC) + go_passive = 1; + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + cl->cl_total += len; + + if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt tree */ + vttree_remove(cl); + + cftree_remove(cl); + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + + /* update the vt tree */ + vttree_update(cl); + + if (cl->cl_flags & HFSC_USC) { + cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); +#if 0 + /* + * This code causes classes to stay way under their + * limit when multiple classes are used at gigabit + * speed. needs investigation. -kaber + */ + /* + * if myf lags behind by more than one clock tick + * from the current time, adjust myfadj to prevent + * a rate-limited class from going greedy. + * in a steady state under rate-limiting, myf + * fluctuates within one clock tick. + */ + myf_bound = cur_time - PSCHED_JIFFIE2US(1); + if (cl->cl_myf < myf_bound) { + delta = cur_time - cl->cl_myf; + cl->cl_myfadj += delta; + cl->cl_myf += delta; + } +#endif + } + + f = max(cl->cl_myf, cl->cl_cfmin); + if (f != cl->cl_f) { + cl->cl_f = f; + cftree_update(cl); + update_cfmin(cl->cl_parent); + } + } +} + +static void +set_active(struct hfsc_class *cl, unsigned int len) +{ + if (cl->cl_flags & HFSC_RSC) + init_ed(cl, len); + if (cl->cl_flags & HFSC_FSC) + init_vf(cl, len); + + list_add_tail(&cl->dlist, &cl->sched->droplist); +} + +static void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_flags & HFSC_RSC) + eltree_remove(cl); + + list_del(&cl->dlist); + + /* + * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from vttree. + */ +} + +/* + * hack to get length of first packet in queue. + */ +static unsigned int +qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->dequeue(sch); + if (skb == NULL) { + if (net_ratelimit()) + printk("qdisc_peek_len: non work-conserving qdisc ?\n"); + return 0; + } + len = skb->len; + if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { + if (net_ratelimit()) + printk("qdisc_peek_len: failed to requeue\n"); + return 0; + } + return len; +} + +static void +hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + if (len > 0) { + update_vf(cl, 0, 0); + set_passive(cl); + sch->q.qlen -= len; + } +} + +static void +hfsc_adjust_levels(struct hfsc_class *cl) +{ + struct hfsc_class *p; + unsigned int level; + + do { + level = 0; + list_for_each_entry(p, &cl->children, siblings) { + if (p->level > level) + level = p->level; + } + cl->level = level + 1; + } while ((cl = cl->cl_parent) != NULL); +} + +static inline unsigned int +hfsc_hash(u32 h) +{ + h ^= h >> 8; + h ^= h >> 4; + + return h & (HFSC_HSIZE - 1); +} + +static inline struct hfsc_class * +hfsc_find_class(u32 classid, struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + + list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { + if (cl->classid == classid) + return cl; + } + return NULL; +} + +static void +hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, + u64 cur_time) +{ + sc2isc(rsc, &cl->cl_rsc); + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + cl->cl_flags |= HFSC_RSC; +} + +static void +hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) +{ + sc2isc(fsc, &cl->cl_fsc); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + cl->cl_flags |= HFSC_FSC; +} + +static void +hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, + u64 cur_time) +{ + sc2isc(usc, &cl->cl_usc); + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); + cl->cl_flags |= HFSC_USC; +} + +static int +hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)*arg; + struct hfsc_class *parent = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_HFSC_MAX]; + struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; + u64 cur_time; + + if (opt == NULL || rtattr_parse_nested(tb, TCA_HFSC_MAX, opt)) + return -EINVAL; + + if (tb[TCA_HFSC_RSC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) + return -EINVAL; + rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); + if (rsc->m1 == 0 && rsc->m2 == 0) + rsc = NULL; + } + + if (tb[TCA_HFSC_FSC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) + return -EINVAL; + fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); + if (fsc->m1 == 0 && fsc->m2 == 0) + fsc = NULL; + } + + if (tb[TCA_HFSC_USC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) + return -EINVAL; + usc = RTA_DATA(tb[TCA_HFSC_USC-1]); + if (usc->m1 == 0 && usc->m2 == 0) + usc = NULL; + } + + if (cl != NULL) { + if (parentid) { + if (cl->cl_parent && cl->cl_parent->classid != parentid) + return -EINVAL; + if (cl->cl_parent == NULL && parentid != TC_H_ROOT) + return -EINVAL; + } + PSCHED_GET_TIME(cur_time); + + sch_tree_lock(sch); + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, cur_time); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, cur_time); + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) + update_ed(cl, qdisc_peek_len(cl->qdisc)); + if (cl->cl_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + } + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EEXIST; + + parent = &q->root; + if (parentid) { + parent = hfsc_find_class(parentid, sch); + if (parent == NULL) + return -ENOENT; + } + + if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) + return -EINVAL; + if (hfsc_find_class(classid, sch)) + return -EEXIST; + + if (rsc == NULL && fsc == NULL) + return -EINVAL; + + cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + memset(cl, 0, sizeof(struct hfsc_class)); + + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, 0); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, 0); + + cl->refcnt = 1; + cl->classid = classid; + cl->sched = q; + cl->cl_parent = parent; + cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + cl->stats_lock = &sch->dev->queue_lock; + INIT_LIST_HEAD(&cl->children); + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + + sch_tree_lock(sch); + list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); + list_add_tail(&cl->siblings, &parent->children); + if (parent->level == 0) + hfsc_purge_queue(sch, parent); + hfsc_adjust_levels(parent); + cl->cl_pcvtoff = parent->cl_cvtoff; + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + *arg = (unsigned long)cl; + return 0; +} + +static void +hfsc_destroy_filters(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} + +static void +hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) +{ + struct hfsc_sched *q = qdisc_priv(sch); + + hfsc_destroy_filters(&cl->filter_list); + qdisc_destroy(cl->qdisc); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&cl->bstats, &cl->rate_est); +#endif + if (cl != &q->root) + kfree(cl); +} + +static int +hfsc_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) + return -EBUSY; + + sch_tree_lock(sch); + + list_del(&cl->hlist); + list_del(&cl->siblings); + hfsc_adjust_levels(cl->cl_parent); + hfsc_purge_queue(sch, cl); + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); + + sch_tree_unlock(sch); + return 0; +} + +static struct hfsc_class * +hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && + (cl = hfsc_find_class(skb->priority, sch)) != NULL) + if (cl->level == 0) + return cl; + + *qerr = NET_XMIT_DROP; + tcf = q->root.filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + if (result == TC_POLICE_SHOT) + return NULL; +#endif + if ((cl = (struct hfsc_class *)res.class) == NULL) { + if ((cl = hfsc_find_class(res.classid, sch)) == NULL) + break; /* filter selected invalid classid */ + } + + if (cl->level == 0) + return cl; /* hit leaf class */ + + /* apply inner filter chain */ + tcf = cl->filter_list; + } + + /* classification failed, try default class */ + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + if (cl == NULL || cl->level > 0) + return NULL; + + return cl; +} + +static int +hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + return -ENOENT; + if (cl->level > 0) + return -EINVAL; + if (new == NULL) { + new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + hfsc_purge_queue(sch, cl); + *old = xchg(&cl->qdisc, new); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc * +hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl != NULL && cl->level == 0) + return cl->qdisc; + + return NULL; +} + +static unsigned long +hfsc_get_class(struct Qdisc *sch, u32 classid) +{ + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void +hfsc_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); +} + +static unsigned long +hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + struct hfsc_class *p = (struct hfsc_class *)parent; + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) { + if (p != NULL && p->level <= cl->level) + return 0; + cl->filter_cnt++; + } + + return (unsigned long)cl; +} + +static void +hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + cl->filter_cnt--; +} + +static struct tcf_proto ** +hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + cl = &q->root; + + return &cl->filter_list; +} + +static int +hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) +{ + struct tc_service_curve tsc; + + tsc.m1 = sm2m(sc->sm1); + tsc.d = dx2d(sc->dx); + tsc.m2 = sm2m(sc->sm2); + RTA_PUT(skb, attr, sizeof(tsc), &tsc); + + return skb->len; + + rtattr_failure: + return -1; +} + +static inline int +hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) +{ + if ((cl->cl_flags & HFSC_RSC) && + (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) + goto rtattr_failure; + + if ((cl->cl_flags & HFSC_FSC) && + (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) + goto rtattr_failure; + + if ((cl->cl_flags & HFSC_USC) && + (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) + goto rtattr_failure; + + return skb->len; + + rtattr_failure: + return -1; +} + +static int +hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + unsigned char *b = skb->tail; + struct rtattr *rta = (struct rtattr *)b; + + tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; + tcm->tcm_handle = cl->classid; + if (cl->level == 0) + tcm->tcm_info = cl->qdisc->handle; + + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (hfsc_dump_curves(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + struct tc_hfsc_stats xstats; + + cl->qstats.qlen = cl->qdisc->q.qlen; + xstats.level = cl->level; + xstats.period = cl->cl_vtperiod; + xstats.work = cl->cl_total; + xstats.rtwork = cl->cl_cumul; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || +#endif + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + + + +static void +hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry(cl, &q->clhash[i], hlist) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static void +hfsc_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static void +hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + u64 next_time = 0; + long delay; + + if ((cl = eltree_get_minel(q)) != NULL) + next_time = cl->cl_e; + if (q->root.cl_cfmin != 0) { + if (next_time == 0 || next_time > q->root.cl_cfmin) + next_time = q->root.cl_cfmin; + } + ASSERT(next_time != 0); + delay = next_time - cur_time; + delay = PSCHED_US2JIFFIE(delay); + + sch->flags |= TCQ_F_THROTTLED; + mod_timer(&q->wd_timer, jiffies + delay); +} + +static int +hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct tc_hfsc_qopt *qopt; + unsigned int i; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(opt); + + sch->stats_lock = &sch->dev->queue_lock; + + q->defcls = qopt->defcls; + for (i = 0; i < HFSC_HSIZE; i++) + INIT_LIST_HEAD(&q->clhash[i]); + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + skb_queue_head_init(&q->requeue); + + q->root.refcnt = 1; + q->root.classid = sch->handle; + q->root.sched = q; + q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (q->root.qdisc == NULL) + q->root.qdisc = &noop_qdisc; + q->root.stats_lock = &sch->dev->queue_lock; + INIT_LIST_HEAD(&q->root.children); + q->root.vt_tree = RB_ROOT; + q->root.cf_tree = RB_ROOT; + + list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); + + init_timer(&q->wd_timer); + q->wd_timer.function = hfsc_watchdog; + q->wd_timer.data = (unsigned long)sch; + + return 0; +} + +static int +hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct tc_hfsc_qopt *qopt; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(opt); + + sch_tree_lock(sch); + q->defcls = qopt->defcls; + sch_tree_unlock(sch); + + return 0; +} + +static void +hfsc_reset_class(struct hfsc_class *cl) +{ + cl->cl_total = 0; + cl->cl_cumul = 0; + cl->cl_d = 0; + cl->cl_e = 0; + cl->cl_vt = 0; + cl->cl_vtadj = 0; + cl->cl_vtoff = 0; + cl->cl_cvtmin = 0; + cl->cl_cvtmax = 0; + cl->cl_cvtoff = 0; + cl->cl_pcvtoff = 0; + cl->cl_vtperiod = 0; + cl->cl_parentperiod = 0; + cl->cl_f = 0; + cl->cl_myf = 0; + cl->cl_myfadj = 0; + cl->cl_cfmin = 0; + cl->cl_nactive = 0; + + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + qdisc_reset(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); + if (cl->cl_flags & HFSC_FSC) + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); + if (cl->cl_flags & HFSC_USC) + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); +} + +static void +hfsc_reset_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int i; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry(cl, &q->clhash[i], hlist) + hfsc_reset_class(cl); + } + __skb_queue_purge(&q->requeue); + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + del_timer(&q->wd_timer); + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen = 0; +} + +static void +hfsc_destroy_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl, *next; + unsigned int i; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) + hfsc_destroy_class(sch, cl); + } + __skb_queue_purge(&q->requeue); + del_timer(&q->wd_timer); +} + +static int +hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hfsc_sched *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_hfsc_qopt qopt; + + qopt.defcls = q->defcls; + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_class *cl; + unsigned int len; + int err; + + cl = hfsc_classify(skb, sch, &err); + if (cl == NULL) { + if (err == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + + len = skb->len; + err = cl->qdisc->enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + cl->qstats.drops++; + sch->qstats.drops++; + return err; + } + + if (cl->qdisc->q.qlen == 1) + set_active(cl, len); + + cl->bstats.packets++; + cl->bstats.bytes += len; + sch->bstats.packets++; + sch->bstats.bytes += len; + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static struct sk_buff * +hfsc_dequeue(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + struct sk_buff *skb; + u64 cur_time; + unsigned int next_len; + int realtime = 0; + + if (sch->q.qlen == 0) + return NULL; + if ((skb = __skb_dequeue(&q->requeue))) + goto out; + + PSCHED_GET_TIME(cur_time); + + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. + */ + if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { + realtime = 1; + } else { + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = vttree_get_minvt(&q->root, cur_time); + if (cl == NULL) { + sch->qstats.overlimits++; + hfsc_schedule_watchdog(sch, cur_time); + return NULL; + } + } + + skb = cl->qdisc->dequeue(cl->qdisc); + if (skb == NULL) { + if (net_ratelimit()) + printk("HFSC: Non-work-conserving qdisc ?\n"); + return NULL; + } + + update_vf(cl, skb->len, cur_time); + if (realtime) + cl->cl_cumul += skb->len; + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) { + /* update ed */ + next_len = qdisc_peek_len(cl->qdisc); + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + out: + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen--; + + return skb; +} + +static int +hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + + __skb_queue_head(&q->requeue, skb); + sch->q.qlen++; + sch->qstats.requeues++; + return NET_XMIT_SUCCESS; +} + +static unsigned int +hfsc_drop(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->droplist, dlist) { + if (cl->qdisc->ops->drop != NULL && + (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } else { + list_move_tail(&cl->dlist, &q->droplist); + } + cl->qstats.drops++; + sch->qstats.drops++; + sch->q.qlen--; + return len; + } + } + return 0; +} + +static struct Qdisc_class_ops hfsc_class_ops = { + .change = hfsc_change_class, + .delete = hfsc_delete_class, + .graft = hfsc_graft_class, + .leaf = hfsc_class_leaf, + .get = hfsc_get_class, + .put = hfsc_put_class, + .bind_tcf = hfsc_bind_tcf, + .unbind_tcf = hfsc_unbind_tcf, + .tcf_chain = hfsc_tcf_chain, + .dump = hfsc_dump_class, + .dump_stats = hfsc_dump_class_stats, + .walk = hfsc_walk +}; + +static struct Qdisc_ops hfsc_qdisc_ops = { + .id = "hfsc", + .init = hfsc_init_qdisc, + .change = hfsc_change_qdisc, + .reset = hfsc_reset_qdisc, + .destroy = hfsc_destroy_qdisc, + .dump = hfsc_dump_qdisc, + .enqueue = hfsc_enqueue, + .dequeue = hfsc_dequeue, + .requeue = hfsc_requeue, + .drop = hfsc_drop, + .cl_ops = &hfsc_class_ops, + .priv_size = sizeof(struct hfsc_sched), + .owner = THIS_MODULE +}; + +static int __init +hfsc_init(void) +{ + return register_qdisc(&hfsc_qdisc_ops); +} + +static void __exit +hfsc_cleanup(void) +{ + unregister_qdisc(&hfsc_qdisc_ops); +} + +MODULE_LICENSE("GPL"); +module_init(hfsc_init); +module_exit(hfsc_cleanup); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c new file mode 100644 index 00000000000..a85935e7d53 --- /dev/null +++ b/net/sched/sch_htb.c @@ -0,0 +1,1759 @@ +/* vim: ts=8 sw=8 + * net/sched/sch_htb.c Hierarchical token bucket, feed tree version + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Martin Devera, <devik@cdi.cz> + * + * Credits (in time order) for older HTB versions: + * Stef Coene <stef.coene@docum.org> + * HTB support at LARTC mailing list + * Ondrej Kraus, <krauso@barr.cz> + * found missing INIT_QDISC(htb) + * Vladimir Smelhaus, Aamer Akhter, Bert Hubert + * helped a lot to locate nasty class stall bug + * Andi Kleen, Jamal Hadi, Bert Hubert + * code review and helpful comments on shaping + * Tomasz Wrona, <tw@eter.tym.pl> + * created test case so that I was able to fix nasty bug + * Wilfried Weissmann + * spotted bug in dequeue code and helped with fix + * Jiri Fojtasek + * fixed requeue routine + * and many others. thanks. + * + * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $ + */ +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/compiler.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/rbtree.h> + +/* HTB algorithm. + Author: devik@cdi.cz + ======================================================================== + HTB is like TBF with multiple classes. It is also similar to CBQ because + it allows to assign priority to each class in hierarchy. + In fact it is another implementation of Floyd's formal sharing. + + Levels: + Each class is assigned level. Leaf has ALWAYS level 0 and root + classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level + one less than their parent. +*/ + +#define HTB_HSIZE 16 /* classid hash size */ +#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ +#undef HTB_DEBUG /* compile debugging support (activated by tc tool) */ +#define HTB_RATECM 1 /* whether to use rate computer */ +#define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ +#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) +#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) +#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ + +#if HTB_VER >> 16 != TC_HTB_PROTOVER +#error "Mismatched sch_htb.c and pkt_sch.h" +#endif + +/* debugging support; S is subsystem, these are defined: + 0 - netlink messages + 1 - enqueue + 2 - drop & requeue + 3 - dequeue main + 4 - dequeue one prio DRR part + 5 - dequeue class accounting + 6 - class overlimit status computation + 7 - hint tree + 8 - event queue + 10 - rate estimator + 11 - classifier + 12 - fast dequeue cache + + L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full + q->debug uint32 contains 16 2-bit fields one for subsystem starting + from LSB + */ +#ifdef HTB_DEBUG +#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) +#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ + printk(KERN_DEBUG FMT,##ARG) +#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) +#define HTB_PASSQ q, +#define HTB_ARGQ struct htb_sched *q, +#define static +#undef __inline__ +#define __inline__ +#undef inline +#define inline +#define HTB_CMAGIC 0xFEFAFEF1 +#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ + if ((N)->rb_color == -1) break; \ + rb_erase(N,R); \ + (N)->rb_color = -1; } while (0) +#else +#define HTB_DBG_COND(S,L) (0) +#define HTB_DBG(S,L,FMT,ARG...) +#define HTB_PASSQ +#define HTB_ARGQ +#define HTB_CHCL(cl) +#define htb_safe_rb_erase(N,R) rb_erase(N,R) +#endif + + +/* used internaly to keep status of single class */ +enum htb_cmode { + HTB_CANT_SEND, /* class can't send and can't borrow */ + HTB_MAY_BORROW, /* class can't send but may borrow */ + HTB_CAN_SEND /* class can send */ +}; + +/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_class +{ +#ifdef HTB_DEBUG + unsigned magic; +#endif + /* general class parameters */ + u32 classid; + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + struct tc_htb_xstats xstats;/* our special stats */ + int refcnt; /* usage count of this class */ + +#ifdef HTB_RATECM + /* rate measurement counters */ + unsigned long rate_bytes,sum_bytes; + unsigned long rate_packets,sum_packets; +#endif + + /* topology */ + int level; /* our level (see above) */ + struct htb_class *parent; /* parent class */ + struct list_head hlist; /* classid hash list item */ + struct list_head sibling; /* sibling list item */ + struct list_head children; /* children list */ + + union { + struct htb_class_leaf { + struct Qdisc *q; + int prio; + int aprio; + int quantum; + int deficit[TC_HTB_MAXDEPTH]; + struct list_head drop_list; + } leaf; + struct htb_class_inner { + struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ + struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ + /* When class changes from state 1->2 and disconnects from + parent's feed then we lost ptr value and start from the + first child again. Here we store classid of the + last valid ptr (used when ptr is NULL). */ + u32 last_ptr_id[TC_HTB_NUMPRIO]; + } inner; + } un; + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ + struct rb_node pq_node; /* node for event queue */ + unsigned long pq_key; /* the same type as jiffies global */ + + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + + /* class attached filters */ + struct tcf_proto *filter_list; + int filter_cnt; + + int warned; /* only one warning about non work conserving .. */ + + /* token bucket parameters */ + struct qdisc_rate_table *rate; /* rate table of the class itself */ + struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ + long buffer,cbuffer; /* token bucket depth/rate */ + long mbuffer; /* max wait time */ + long tokens,ctokens; /* current number of tokens */ + psched_time_t t_c; /* checkpoint time */ +}; + +/* TODO: maybe compute rate when size is too large .. or drop ? */ +static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, + int size) +{ + int slot = size >> rate->rate.cell_log; + if (slot > 255) { + cl->xstats.giants++; + slot = 255; + } + return rate->data[slot]; +} + +struct htb_sched +{ + struct list_head root; /* root classes list */ + struct list_head hash[HTB_HSIZE]; /* hashed by classid */ + struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ + + /* self list - roots of self generating tree */ + struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + int row_mask[TC_HTB_MAXDEPTH]; + struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + + /* self wait list - roots of wait PQs per row */ + struct rb_root wait_pq[TC_HTB_MAXDEPTH]; + + /* time of nearest event per level (row) */ + unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; + + /* cached value of jiffies in dequeue */ + unsigned long jiffies; + + /* whether we hit non-work conserving class during this dequeue; we use */ + int nwc_hit; /* this to disable mindelay complaint in dequeue */ + + int defcls; /* class where unclassified flows go to */ + u32 debug; /* subsystem debug levels */ + + /* filters for qdisc itself */ + struct tcf_proto *filter_list; + int filter_cnt; + + int rate2quantum; /* quant = rate / rate2quantum */ + psched_time_t now; /* cached dequeue time */ + struct timer_list timer; /* send delay timer */ +#ifdef HTB_RATECM + struct timer_list rttim; /* rate computer timer */ + int recmp_bucket; /* which hash bucket to recompute next */ +#endif + + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + int direct_qlen; /* max qlen of above */ + + long direct_pkts; +}; + +/* compute hash of size HTB_HSIZE for given handle */ +static __inline__ int htb_hash(u32 h) +{ +#if HTB_HSIZE != 16 + #error "Declare new hash for your HTB_HSIZE" +#endif + h ^= h>>8; /* stolen from cbq_hash */ + h ^= h>>4; + return h & 0xf; +} + +/* find class in global hash table using given handle */ +static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + struct list_head *p; + if (TC_H_MAJ(handle) != sch->handle) + return NULL; + + list_for_each (p,q->hash+htb_hash(handle)) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (cl->classid == handle) + return cl; + } + return NULL; +} + +/** + * htb_classify - classify a packet into class + * + * It returns NULL if the packet should be dropped or -1 if the packet + * should be passed directly thru. In all other cases leaf class is returned. + * We allow direct class selection by classid in priority. The we examine + * filters in qdisc and in inner nodes (if higher filter points to the inner + * node). If we end up with classid MAJOR:0 we enqueue the skb into special + * internal fifo (direct). These packets then go directly thru. If we still + * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull + * then finish and return direct queue. + */ +#define HTB_DIRECT (struct htb_class*)-1 +static inline u32 htb_classid(struct htb_class *cl) +{ + return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC; +} + +static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + /* allow to select class by setting skb->priority to valid classid; + note that nfmark can be used too by attaching filter fw with no + rules in it */ + if (skb->priority == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) selected */ + if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) + return cl; + + *qerr = NET_XMIT_DROP; + tcf = q->filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + if (result == TC_POLICE_SHOT) + return HTB_DIRECT; +#endif + if ((cl = (void*)res.class) == NULL) { + if (res.classid == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) */ + if ((cl = htb_find(res.classid,sch)) == NULL) + break; /* filter selected invalid classid */ + } + if (!cl->level) + return cl; /* we hit leaf; return it */ + + /* we have got inner class; apply inner filter chain */ + tcf = cl->filter_list; + } + /* classification failed; try to use default class */ + cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); + if (!cl || cl->level) + return HTB_DIRECT; /* bad default .. this is safe bet */ + return cl; +} + +#ifdef HTB_DEBUG +static void htb_next_rb_node(struct rb_node **n); +#define HTB_DUMTREE(root,memb) if(root) { \ + struct rb_node *n = (root)->rb_node; \ + while (n->rb_left) n = n->rb_left; \ + while (n) { \ + struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ + printk(" %x",cl->classid); htb_next_rb_node (&n); \ + } } + +static void htb_debug_dump (struct htb_sched *q) +{ + int i,p; + printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); + /* rows */ + for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { + printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); + for (p=0;p<TC_HTB_NUMPRIO;p++) { + if (!q->row[i][p].rb_node) continue; + printk(" p%d:",p); + HTB_DUMTREE(q->row[i]+p,node[p]); + } + printk("\n"); + } + /* classes */ + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *l; + list_for_each (l,q->hash+i) { + struct htb_class *cl = list_entry(l,struct htb_class,hlist); + long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); + printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " + "pa=%x f:", + cl->classid,cl->cmode,cl->tokens,cl->ctokens, + cl->pq_node.rb_color==-1?0:cl->pq_key,diff, + cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); + if (cl->level) + for (p=0;p<TC_HTB_NUMPRIO;p++) { + if (!cl->un.inner.feed[p].rb_node) continue; + printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); + HTB_DUMTREE(cl->un.inner.feed+p,node[p]); + } + printk("\n"); + } + } +} +#endif +/** + * htb_add_to_id_tree - adds class to the round robin list + * + * Routine adds class to the list (actually tree) sorted by classid. + * Make sure that class is not already on such list for given prio. + */ +static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, + struct htb_class *cl,int prio) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); +#ifdef HTB_DEBUG + if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } + HTB_CHCL(cl); + if (*p) { + struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); + HTB_CHCL(x); + } +#endif + while (*p) { + struct htb_class *c; parent = *p; + c = rb_entry(parent, struct htb_class, node[prio]); + HTB_CHCL(c); + if (cl->classid > c->classid) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->node[prio], parent, p); + rb_insert_color(&cl->node[prio], root); +} + +/** + * htb_add_to_wait_tree - adds class to the event queue with delay + * + * The class is added to priority event queue to indicate that class will + * change its mode in cl->pq_key microseconds. Make sure that class is not + * already in the queue. + */ +static void htb_add_to_wait_tree (struct htb_sched *q, + struct htb_class *cl,long delay,int debug_hint) +{ + struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); +#ifdef HTB_DEBUG + if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } + HTB_CHCL(cl); + if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) + printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); +#endif + cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); + if (cl->pq_key == q->jiffies) + cl->pq_key++; + + /* update the nearest event cache */ + if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) + q->near_ev_cache[cl->level] = cl->pq_key; + + while (*p) { + struct htb_class *c; parent = *p; + c = rb_entry(parent, struct htb_class, pq_node); + if (time_after_eq(cl->pq_key, c->pq_key)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->pq_node, parent, p); + rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); +} + +/** + * htb_next_rb_node - finds next node in binary tree + * + * When we are past last key we return NULL. + * Average complexity is 2 steps per call. + */ +static void htb_next_rb_node(struct rb_node **n) +{ + *n = rb_next(*n); +} + +/** + * htb_add_class_to_row - add class to its row + * + * The class is added to row at priorities marked in mask. + * It does nothing if mask == 0. + */ +static inline void htb_add_class_to_row(struct htb_sched *q, + struct htb_class *cl,int mask) +{ + HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", + cl->classid,mask,q->row_mask[cl->level]); + HTB_CHCL(cl); + q->row_mask[cl->level] |= mask; + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); + } +} + +/** + * htb_remove_class_from_row - removes class from its row + * + * The class is removed from row at priorities marked in mask. + * It does nothing if mask == 0. + */ +static __inline__ void htb_remove_class_from_row(struct htb_sched *q, + struct htb_class *cl,int mask) +{ + int m = 0; + HTB_CHCL(cl); + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + if (q->ptr[cl->level][prio] == cl->node+prio) + htb_next_rb_node(q->ptr[cl->level]+prio); + htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); + if (!q->row[cl->level][prio].rb_node) + m |= 1 << prio; + } + HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", + cl->classid,mask,q->row_mask[cl->level],m); + q->row_mask[cl->level] &= ~m; +} + +/** + * htb_activate_prios - creates active classe's feed chain + * + * The class is connected to ancestors and/or appropriate rows + * for priorities it is participating on. cl->cmode must be new + * (activated) mode. It does nothing if cl->prio_activity == 0. + */ +static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m,mask = cl->prio_activity; + HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); + HTB_CHCL(cl); + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + HTB_CHCL(p); + m = mask; while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.feed[prio].rb_node) + /* parent already has its feed in use so that + reset bit in mask as parent is already ok */ + mask &= ~(1 << prio); + + htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio); + } + HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", + p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity |= mask; + cl = p; p = cl->parent; + HTB_CHCL(cl); + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_add_class_to_row(q,cl,mask); +} + +/** + * htb_deactivate_prios - remove class from feed chain + * + * cl->cmode must represent old mode (before deactivation). It does + * nothing if cl->prio_activity == 0. Class is removed from all feed + * chains and rows. + */ +static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m,mask = cl->prio_activity; + HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); + HTB_CHCL(cl); + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + m = mask; mask = 0; + while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.ptr[prio] == cl->node+prio) { + /* we are removing child which is pointed to from + parent feed - forget the pointer but remember + classid */ + p->un.inner.last_ptr_id[prio] = cl->classid; + p->un.inner.ptr[prio] = NULL; + } + + htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); + + if (!p->un.inner.feed[prio].rb_node) + mask |= 1 << prio; + } + HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", + p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity &= ~mask; + cl = p; p = cl->parent; + HTB_CHCL(cl); + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_remove_class_from_row(q,cl,mask); +} + +/** + * htb_class_mode - computes and returns current class mode + * + * It computes cl's mode at time cl->t_c+diff and returns it. If mode + * is not HTB_CAN_SEND then cl->pq_key is updated to time difference + * from now to time when cl will change its state. + * Also it is worth to note that class mode doesn't change simply + * at cl->{c,}tokens == 0 but there can rather be hysteresis of + * 0 .. -cl->{c,}buffer range. It is meant to limit number of + * mode transitions per time unit. The speed gain is about 1/6. + */ +static __inline__ enum htb_cmode +htb_class_mode(struct htb_class *cl,long *diff) +{ + long toks; + + if ((toks = (cl->ctokens + *diff)) < ( +#if HTB_HYSTERESIS + cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : +#endif + 0)) { + *diff = -toks; + return HTB_CANT_SEND; + } + if ((toks = (cl->tokens + *diff)) >= ( +#if HTB_HYSTERESIS + cl->cmode == HTB_CAN_SEND ? -cl->buffer : +#endif + 0)) + return HTB_CAN_SEND; + + *diff = -toks; + return HTB_MAY_BORROW; +} + +/** + * htb_change_class_mode - changes classe's mode + * + * This should be the only way how to change classe's mode under normal + * cirsumstances. Routine will update feed lists linkage, change mode + * and add class to the wait event queue if appropriate. New mode should + * be different from old one and cl->pq_key has to be valid if changing + * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). + */ +static void +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +{ + enum htb_cmode new_mode = htb_class_mode(cl,diff); + + HTB_CHCL(cl); + HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); + + if (new_mode == cl->cmode) + return; + + if (cl->prio_activity) { /* not necessary: speed optimization */ + if (cl->cmode != HTB_CANT_SEND) + htb_deactivate_prios(q,cl); + cl->cmode = new_mode; + if (new_mode != HTB_CANT_SEND) + htb_activate_prios(q,cl); + } else + cl->cmode = new_mode; +} + +/** + * htb_activate - inserts leaf cl into appropriate active feeds + * + * Routine learns (new) priority of leaf and activates feed chain + * for the prio. It can be called on already active leaf safely. + * It also adds leaf into droplist. + */ +static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) +{ + BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); + HTB_CHCL(cl); + if (!cl->prio_activity) { + cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); + htb_activate_prios(q,cl); + list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); + } +} + +/** + * htb_deactivate - remove leaf cl from active feeds + * + * Make sure that leaf is active. In the other words it can't be called + * with non-active leaf. It also removes class from the drop list. + */ +static __inline__ void +htb_deactivate(struct htb_sched *q,struct htb_class *cl) +{ + BUG_TRAP(cl->prio_activity); + HTB_CHCL(cl); + htb_deactivate_prios(q,cl); + cl->prio_activity = 0; + list_del_init(&cl->un.leaf.drop_list); +} + +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + int ret; + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_classify(skb,sch,&ret); + + if (cl == HTB_DIRECT) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } +#ifdef CONFIG_NET_CLS_ACT + } else if (!cl) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb (skb); + return ret; +#endif + } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { + sch->qstats.drops++; + cl->qstats.drops++; + return NET_XMIT_DROP; + } else { + cl->bstats.packets++; cl->bstats.bytes += skb->len; + htb_activate (q,cl); + } + + sch->q.qlen++; + sch->bstats.packets++; sch->bstats.bytes += skb->len; + HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); + return NET_XMIT_SUCCESS; +} + +/* TODO: requeuing packet charges it to policers again !! */ +static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int ret = NET_XMIT_SUCCESS; + struct htb_class *cl = htb_classify(skb,sch, &ret); + struct sk_buff *tskb; + + if (cl == HTB_DIRECT || !cl) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen && cl) { + __skb_queue_head(&q->direct_queue, skb); + } else { + __skb_queue_head(&q->direct_queue, skb); + tskb = __skb_dequeue_tail(&q->direct_queue); + kfree_skb (tskb); + sch->qstats.drops++; + return NET_XMIT_CN; + } + } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { + sch->qstats.drops++; + cl->qstats.drops++; + return NET_XMIT_DROP; + } else + htb_activate (q,cl); + + sch->q.qlen++; + sch->qstats.requeues++; + HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); + return NET_XMIT_SUCCESS; +} + +static void htb_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + sch->flags &= ~TCQ_F_THROTTLED; + wmb(); + netif_schedule(sch->dev); +} + +#ifdef HTB_RATECM +#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 +static void htb_rate_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct htb_sched *q = qdisc_priv(sch); + struct list_head *p; + + /* lock queue so that we can muck with it */ + HTB_QLOCK(sch); + HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); + + q->rttim.expires = jiffies + HZ; + add_timer(&q->rttim); + + /* scan and recompute one bucket at time */ + if (++q->recmp_bucket >= HTB_HSIZE) + q->recmp_bucket = 0; + list_for_each (p,q->hash+q->recmp_bucket) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", + cl->classid,cl->sum_bytes,cl->sum_packets); + RT_GEN (cl->sum_bytes,cl->rate_bytes); + RT_GEN (cl->sum_packets,cl->rate_packets); + } + HTB_QUNLOCK(sch); +} +#endif + +/** + * htb_charge_class - charges amount "bytes" to leaf and ancestors + * + * Routine assumes that packet "bytes" long was dequeued from leaf cl + * borrowing from "level". It accounts bytes to ceil leaky bucket for + * leaf and all ancestors and to rate bucket for ancestors at levels + * "level" and higher. It also handles possible change of mode resulting + * from the update. Note that mode can also increase here (MAY_BORROW to + * CAN_SEND) because we can use more precise clock that event queue here. + * In such case we remove class from event queue first. + */ +static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, + int level,int bytes) +{ + long toks,diff; + enum htb_cmode old_mode; + HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); + +#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ + if (toks > cl->B) toks = cl->B; \ + toks -= L2T(cl, cl->R, bytes); \ + if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ + cl->T = toks + + while (cl) { + HTB_CHCL(cl); + diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); +#ifdef HTB_DEBUG + if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { + if (net_ratelimit()) + printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", + cl->classid, diff, +#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY + q->now.tv_sec * 1000000ULL + q->now.tv_usec, + cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, +#else + (unsigned long long) q->now, + (unsigned long long) cl->t_c, +#endif + q->jiffies); + diff = 1000; + } +#endif + if (cl->level >= level) { + if (cl->level == level) cl->xstats.lends++; + HTB_ACCNT (tokens,buffer,rate); + } else { + cl->xstats.borrows++; + cl->tokens += diff; /* we moved t_c; update tokens */ + } + HTB_ACCNT (ctokens,cbuffer,ceil); + cl->t_c = q->now; + HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); + + old_mode = cl->cmode; diff = 0; + htb_change_class_mode(q,cl,&diff); + if (old_mode != cl->cmode) { + if (old_mode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree (q,cl,diff,1); + } + +#ifdef HTB_RATECM + /* update rate counters */ + cl->sum_bytes += bytes; cl->sum_packets++; +#endif + + /* update byte stats except for leaves which are already updated */ + if (cl->level) { + cl->bstats.bytes += bytes; + cl->bstats.packets++; + } + cl = cl->parent; + } +} + +/** + * htb_do_events - make mode changes to classes at the level + * + * Scans event queue for pending events and applies them. Returns jiffies to + * next pending event (0 for no event in pq). + * Note: Aplied are events whose have cl->pq_key <= jiffies. + */ +static long htb_do_events(struct htb_sched *q,int level) +{ + int i; + HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", + level,q->wait_pq[level].rb_node,q->row_mask[level]); + for (i = 0; i < 500; i++) { + struct htb_class *cl; + long diff; + struct rb_node *p = q->wait_pq[level].rb_node; + if (!p) return 0; + while (p->rb_left) p = p->rb_left; + + cl = rb_entry(p, struct htb_class, pq_node); + if (time_after(cl->pq_key, q->jiffies)) { + HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); + return cl->pq_key - q->jiffies; + } + htb_safe_rb_erase(p,q->wait_pq+level); + diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); +#ifdef HTB_DEBUG + if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { + if (net_ratelimit()) + printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", + cl->classid, diff, +#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY + q->now.tv_sec * 1000000ULL + q->now.tv_usec, + cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, +#else + (unsigned long long) q->now, + (unsigned long long) cl->t_c, +#endif + q->jiffies); + diff = 1000; + } +#endif + htb_change_class_mode(q,cl,&diff); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree (q,cl,diff,2); + } + if (net_ratelimit()) + printk(KERN_WARNING "htb: too many events !\n"); + return HZ/10; +} + +/* Returns class->node+prio from id-tree where classe's id is >= id. NULL + is no such one exists. */ +static struct rb_node * +htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) +{ + struct rb_node *r = NULL; + while (n) { + struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]); + if (id == cl->classid) return n; + + if (id > cl->classid) { + n = n->rb_right; + } else { + r = n; + n = n->rb_left; + } + } + return r; +} + +/** + * htb_lookup_leaf - returns next leaf class in DRR order + * + * Find leaf where current feed pointers points to. + */ +static struct htb_class * +htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid) +{ + int i; + struct { + struct rb_node *root; + struct rb_node **pptr; + u32 *pid; + } stk[TC_HTB_MAXDEPTH],*sp = stk; + + BUG_TRAP(tree->rb_node); + sp->root = tree->rb_node; + sp->pptr = pptr; + sp->pid = pid; + + for (i = 0; i < 65535; i++) { + HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid); + + if (!*sp->pptr && *sp->pid) { + /* ptr was invalidated but id is valid - try to recover + the original or next ptr */ + *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid); + } + *sp->pid = 0; /* ptr is valid now so that remove this hint as it + can become out of date quickly */ + if (!*sp->pptr) { /* we are at right end; rewind & go up */ + *sp->pptr = sp->root; + while ((*sp->pptr)->rb_left) + *sp->pptr = (*sp->pptr)->rb_left; + if (sp > stk) { + sp--; + BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; + htb_next_rb_node (sp->pptr); + } + } else { + struct htb_class *cl; + cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); + HTB_CHCL(cl); + if (!cl->level) + return cl; + (++sp)->root = cl->un.inner.feed[prio].rb_node; + sp->pptr = cl->un.inner.ptr+prio; + sp->pid = cl->un.inner.last_ptr_id+prio; + } + } + BUG_TRAP(0); + return NULL; +} + +/* dequeues packet at given priority and level; call only if + you are sure that there is active class at prio/level */ +static struct sk_buff * +htb_dequeue_tree(struct htb_sched *q,int prio,int level) +{ + struct sk_buff *skb = NULL; + struct htb_class *cl,*start; + /* look initial class up in the row */ + start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio, + q->ptr[level]+prio,q->last_ptr_id[level]+prio); + + do { +next: + BUG_TRAP(cl); + if (!cl) return NULL; + HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", + prio,level,cl->classid,cl->un.leaf.deficit[level]); + + /* class can be empty - it is unlikely but can be true if leaf + qdisc drops packets in enqueue routine or if someone used + graft operation on the leaf since last dequeue; + simply deactivate and skip such class */ + if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + struct htb_class *next; + htb_deactivate(q,cl); + + /* row/level might become empty */ + if ((q->row_mask[level] & (1 << prio)) == 0) + return NULL; + + next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio, + prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio); + + if (cl == start) /* fix start if we just deleted it */ + start = next; + cl = next; + goto next; + } + + if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) + break; + if (!cl->warned) { + printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); + cl->warned = 1; + } + q->nwc_hit++; + htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio, + q->last_ptr_id[level]+prio); + + } while (cl != start); + + if (likely(skb != NULL)) { + if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { + HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", + level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); + cl->un.leaf.deficit[level] += cl->un.leaf.quantum; + htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + } + /* this used to be after charge_class but this constelation + gives us slightly better performance */ + if (!cl->un.leaf.q->q.qlen) + htb_deactivate (q,cl); + htb_charge_class (q,cl,level,skb->len); + } + return skb; +} + +static void htb_delay_by(struct Qdisc *sch,long delay) +{ + struct htb_sched *q = qdisc_priv(sch); + if (delay <= 0) delay = 1; + if (unlikely(delay > 5*HZ)) { + if (net_ratelimit()) + printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); + delay = 5*HZ; + } + /* why don't use jiffies here ? because expires can be in past */ + mod_timer(&q->timer, q->jiffies + delay); + sch->flags |= TCQ_F_THROTTLED; + sch->qstats.overlimits++; + HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); +} + +static struct sk_buff *htb_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb = NULL; + struct htb_sched *q = qdisc_priv(sch); + int level; + long min_delay; +#ifdef HTB_DEBUG + int evs_used = 0; +#endif + + q->jiffies = jiffies; + HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), + sch->q.qlen); + + /* try to dequeue direct packets as high prio (!) to minimize cpu work */ + if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen--; + return skb; + } + + if (!sch->q.qlen) goto fin; + PSCHED_GET_TIME(q->now); + + min_delay = LONG_MAX; + q->nwc_hit = 0; + for (level = 0; level < TC_HTB_MAXDEPTH; level++) { + /* common case optimization - skip event handler quickly */ + int m; + long delay; + if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { + delay = htb_do_events(q,level); + q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ); +#ifdef HTB_DEBUG + evs_used++; +#endif + } else + delay = q->near_ev_cache[level] - q->jiffies; + + if (delay && min_delay > delay) + min_delay = delay; + m = ~q->row_mask[level]; + while (m != (int)(-1)) { + int prio = ffz (m); + m |= 1 << prio; + skb = htb_dequeue_tree(q,prio,level); + if (likely(skb != NULL)) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + goto fin; + } + } + } +#ifdef HTB_DEBUG + if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { + if (min_delay == LONG_MAX) { + printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", + evs_used,q->jiffies,jiffies); + htb_debug_dump(q); + } else + printk(KERN_WARNING "HTB: mindelay=%ld, some class has " + "too small rate\n",min_delay); + } +#endif + htb_delay_by (sch,min_delay > 5*HZ ? 5*HZ : min_delay); +fin: + HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); + return skb; +} + +/* try to drop from each class (by prio) until one succeed */ +static unsigned int htb_drop(struct Qdisc* sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int prio; + + for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { + struct list_head *p; + list_for_each (p,q->drops+prio) { + struct htb_class *cl = list_entry(p, struct htb_class, + un.leaf.drop_list); + unsigned int len; + if (cl->un.leaf.q->ops->drop && + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + sch->q.qlen--; + if (!cl->un.leaf.q->q.qlen) + htb_deactivate (q,cl); + return len; + } + } + } + return 0; +} + +/* reset all classes */ +/* always caled under BH & queue lock */ +static void htb_reset(struct Qdisc* sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int i; + HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); + + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *p; + list_for_each (p,q->hash+i) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (cl->level) + memset(&cl->un.inner,0,sizeof(cl->un.inner)); + else { + if (cl->un.leaf.q) + qdisc_reset(cl->un.leaf.q); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); + } + cl->prio_activity = 0; + cl->cmode = HTB_CAN_SEND; +#ifdef HTB_DEBUG + cl->pq_node.rb_color = -1; + memset(cl->node,255,sizeof(cl->node)); +#endif + + } + } + sch->flags &= ~TCQ_F_THROTTLED; + del_timer(&q->timer); + __skb_queue_purge(&q->direct_queue); + sch->q.qlen = 0; + memset(q->row,0,sizeof(q->row)); + memset(q->row_mask,0,sizeof(q->row_mask)); + memset(q->wait_pq,0,sizeof(q->wait_pq)); + memset(q->ptr,0,sizeof(q->ptr)); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops+i); +} + +static int htb_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct htb_sched *q = qdisc_priv(sch); + struct rtattr *tb[TCA_HTB_INIT]; + struct tc_htb_glob *gopt; + int i; +#ifdef HTB_DEBUG + printk(KERN_INFO "HTB init, kernel part version %d.%d\n", + HTB_VER >> 16,HTB_VER & 0xffff); +#endif + if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) || + tb[TCA_HTB_INIT-1] == NULL || + RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) { + printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); + return -EINVAL; + } + gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); + if (gopt->version != HTB_VER >> 16) { + printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", + HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); + return -EINVAL; + } + q->debug = gopt->debug; + HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); + + INIT_LIST_HEAD(&q->root); + for (i = 0; i < HTB_HSIZE; i++) + INIT_LIST_HEAD(q->hash+i); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops+i); + + init_timer(&q->timer); + skb_queue_head_init(&q->direct_queue); + + q->direct_qlen = sch->dev->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + q->timer.function = htb_timer; + q->timer.data = (unsigned long)sch; + +#ifdef HTB_RATECM + init_timer(&q->rttim); + q->rttim.function = htb_rate_timer; + q->rttim.data = (unsigned long)sch; + q->rttim.expires = jiffies + HZ; + add_timer(&q->rttim); +#endif + if ((q->rate2quantum = gopt->rate2quantum) < 1) + q->rate2quantum = 1; + q->defcls = gopt->defcls; + + return 0; +} + +static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct htb_sched *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_htb_glob gopt; + HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); + HTB_QLOCK(sch); + gopt.direct_pkts = q->direct_pkts; + +#ifdef HTB_DEBUG + if (HTB_DBG_COND(0,2)) + htb_debug_dump(q); +#endif + gopt.version = HTB_VER; + gopt.rate2quantum = q->rate2quantum; + gopt.defcls = q->defcls; + gopt.debug = q->debug; + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); + rta->rta_len = skb->tail - b; + HTB_QUNLOCK(sch); + return skb->len; +rtattr_failure: + HTB_QUNLOCK(sch); + skb_trim(skb, skb->tail - skb->data); + return -1; +} + +static int htb_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = qdisc_priv(sch); +#endif + struct htb_class *cl = (struct htb_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_htb_opt opt; + + HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); + + HTB_QLOCK(sch); + tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT; + tcm->tcm_handle = cl->classid; + if (!cl->level && cl->un.leaf.q) + tcm->tcm_info = cl->un.leaf.q->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + memset (&opt,0,sizeof(opt)); + + opt.rate = cl->rate->rate; opt.buffer = cl->buffer; + opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; + opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; + opt.level = cl->level; + RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + HTB_QUNLOCK(sch); + return skb->len; +rtattr_failure: + HTB_QUNLOCK(sch); + skb_trim(skb, b - skb->data); + return -1; +} + +static int +htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct htb_class *cl = (struct htb_class*)arg; + +#ifdef HTB_RATECM + cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); + cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); +#endif + + if (!cl->level && cl->un.leaf.q) + cl->qstats.qlen = cl->un.leaf.q->q.qlen; + cl->xstats.tokens = cl->tokens; + cl->xstats.ctokens = cl->ctokens; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct htb_class *cl = (struct htb_class*)arg; + + if (cl && !cl->level) { + if (new == NULL && (new = qdisc_create_dflt(sch->dev, + &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + sch_tree_lock(sch); + if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { + if (cl->prio_activity) + htb_deactivate (qdisc_priv(sch),cl); + + /* TODO: is it correct ? Why CBQ doesn't do it ? */ + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + } + sch_tree_unlock(sch); + return 0; + } + return -ENOENT; +} + +static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class*)arg; + return (cl && !cl->level) ? cl->un.leaf.q : NULL; +} + +static unsigned long htb_get(struct Qdisc *sch, u32 classid) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = qdisc_priv(sch); +#endif + struct htb_class *cl = htb_find(classid,sch); + HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); + if (cl) + cl->refcnt++; + return (unsigned long)cl; +} + +static void htb_destroy_filters(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} + +static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) +{ + struct htb_sched *q = qdisc_priv(sch); + HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); + if (!cl->level) { + BUG_TRAP(cl->un.leaf.q); + sch->q.qlen -= cl->un.leaf.q->q.qlen; + qdisc_destroy(cl->un.leaf.q); + } + qdisc_put_rtab(cl->rate); + qdisc_put_rtab(cl->ceil); + + htb_destroy_filters (&cl->filter_list); + + while (!list_empty(&cl->children)) + htb_destroy_class (sch,list_entry(cl->children.next, + struct htb_class,sibling)); + + /* note: this delete may happen twice (see htb_delete) */ + list_del(&cl->hlist); + list_del(&cl->sibling); + + if (cl->prio_activity) + htb_deactivate (q,cl); + + if (cl->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + + kfree(cl); +} + +/* always caled under BH & queue lock */ +static void htb_destroy(struct Qdisc* sch) +{ + struct htb_sched *q = qdisc_priv(sch); + HTB_DBG(0,1,"htb_destroy q=%p\n",q); + + del_timer_sync (&q->timer); +#ifdef HTB_RATECM + del_timer_sync (&q->rttim); +#endif + /* This line used to be after htb_destroy_class call below + and surprisingly it worked in 2.4. But it must precede it + because filter need its target class alive to be able to call + unbind_filter on it (without Oops). */ + htb_destroy_filters(&q->filter_list); + + while (!list_empty(&q->root)) + htb_destroy_class (sch,list_entry(q->root.next, + struct htb_class,sibling)); + + __skb_queue_purge(&q->direct_queue); +} + +static int htb_delete(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class*)arg; + HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + + // TODO: why don't allow to delete subtree ? references ? does + // tc subsys quarantee us that in htb_destroy it holds no class + // refs so that we can remove children safely there ? + if (!list_empty(&cl->children) || cl->filter_cnt) + return -EBUSY; + + sch_tree_lock(sch); + + /* delete from hash and active; remainder in destroy_class */ + list_del_init(&cl->hlist); + if (cl->prio_activity) + htb_deactivate (q,cl); + + if (--cl->refcnt == 0) + htb_destroy_class(sch,cl); + + sch_tree_unlock(sch); + return 0; +} + +static void htb_put(struct Qdisc *sch, unsigned long arg) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = qdisc_priv(sch); +#endif + struct htb_class *cl = (struct htb_class*)arg; + HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + + if (--cl->refcnt == 0) + htb_destroy_class(sch,cl); +} + +static int htb_change_class(struct Qdisc *sch, u32 classid, + u32 parentid, struct rtattr **tca, unsigned long *arg) +{ + int err = -EINVAL; + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class*)*arg,*parent; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; + struct rtattr *tb[TCA_HTB_RTAB]; + struct tc_htb_opt *hopt; + + /* extract all subattrs from opt attr */ + if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) || + tb[TCA_HTB_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) + goto failure; + + parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch); + + hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); + HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); + if (!rtab || !ctab) goto failure; + + if (!cl) { /* new class */ + struct Qdisc *new_q; + /* check for valid classid */ + if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) + goto failure; + + /* check maximal depth */ + if (parent && parent->parent && parent->parent->level < 2) { + printk(KERN_ERR "htb: tree is too deep\n"); + goto failure; + } + err = -ENOBUFS; + if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) + goto failure; + + memset(cl, 0, sizeof(*cl)); + cl->refcnt = 1; + INIT_LIST_HEAD(&cl->sibling); + INIT_LIST_HEAD(&cl->hlist); + INIT_LIST_HEAD(&cl->children); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); +#ifdef HTB_DEBUG + cl->magic = HTB_CMAGIC; +#endif + + /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) + so that can't be used inside of sch_tree_lock + -- thanks to Karlis Peisenieks */ + new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + sch_tree_lock(sch); + if (parent && !parent->level) { + /* turn parent into inner node */ + sch->q.qlen -= parent->un.leaf.q->q.qlen; + qdisc_destroy (parent->un.leaf.q); + if (parent->prio_activity) + htb_deactivate (q,parent); + + /* remove from evt list because of level change */ + if (parent->cmode != HTB_CAN_SEND) { + htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); + parent->cmode = HTB_CAN_SEND; + } + parent->level = (parent->parent ? parent->parent->level + : TC_HTB_MAXDEPTH) - 1; + memset (&parent->un.inner,0,sizeof(parent->un.inner)); + } + /* leaf (we) needs elementary qdisc */ + cl->un.leaf.q = new_q ? new_q : &noop_qdisc; + + cl->classid = classid; cl->parent = parent; + + /* set class to be in HTB_CAN_SEND state */ + cl->tokens = hopt->buffer; + cl->ctokens = hopt->cbuffer; + cl->mbuffer = 60000000; /* 1min */ + PSCHED_GET_TIME(cl->t_c); + cl->cmode = HTB_CAN_SEND; + + /* attach to the hash list and parent's family */ + list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); + list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); +#ifdef HTB_DEBUG + { + int i; + for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; + cl->pq_node.rb_color = -1; + } +#endif + } else sch_tree_lock(sch); + + /* it used to be a nasty bug here, we have to check that node + is really leaf before changing cl->un.leaf ! */ + if (!cl->level) { + cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; + if (!hopt->quantum && cl->un.leaf.quantum < 1000) { + printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid); + cl->un.leaf.quantum = 1000; + } + if (!hopt->quantum && cl->un.leaf.quantum > 200000) { + printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); + cl->un.leaf.quantum = 200000; + } + if (hopt->quantum) + cl->un.leaf.quantum = hopt->quantum; + if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) + cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; + } + + cl->buffer = hopt->buffer; + cl->cbuffer = hopt->cbuffer; + if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; + if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; + sch_tree_unlock(sch); + + *arg = (unsigned long)cl; + return 0; + +failure: + if (rtab) qdisc_put_rtab(rtab); + if (ctab) qdisc_put_rtab(ctab); + return err; +} + +static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)arg; + struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); + return fl; +} + +static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_find (classid,sch); + HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); + /*if (cl && !cl->level) return 0; + The line above used to be there to prevent attaching filters to + leaves. But at least tc_index filter uses this just to get class + for other reasons so that we have to allow for it. + ---- + 19.6.2002 As Werner explained it is ok - bind filter is just + another way to "lock" the class - unlike "get" this lock can + be broken by class during destroy IIUC. + */ + if (cl) + cl->filter_cnt++; + else + q->filter_cnt++; + return (unsigned long)cl; +} + +static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)arg; + HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); + if (cl) + cl->filter_cnt--; + else + q->filter_cnt--; +} + +static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct htb_sched *q = qdisc_priv(sch); + int i; + + if (arg->stop) + return; + + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *p; + list_for_each (p,q->hash+i) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops htb_class_ops = { + .graft = htb_graft, + .leaf = htb_leaf, + .get = htb_get, + .put = htb_put, + .change = htb_change_class, + .delete = htb_delete, + .walk = htb_walk, + .tcf_chain = htb_find_tcf, + .bind_tcf = htb_bind_filter, + .unbind_tcf = htb_unbind_filter, + .dump = htb_dump_class, + .dump_stats = htb_dump_class_stats, +}; + +static struct Qdisc_ops htb_qdisc_ops = { + .next = NULL, + .cl_ops = &htb_class_ops, + .id = "htb", + .priv_size = sizeof(struct htb_sched), + .enqueue = htb_enqueue, + .dequeue = htb_dequeue, + .requeue = htb_requeue, + .drop = htb_drop, + .init = htb_init, + .reset = htb_reset, + .destroy = htb_destroy, + .change = NULL /* htb_change */, + .dump = htb_dump, + .owner = THIS_MODULE, +}; + +static int __init htb_module_init(void) +{ + return register_qdisc(&htb_qdisc_ops); +} +static void __exit htb_module_exit(void) +{ + unregister_qdisc(&htb_qdisc_ops); +} +module_init(htb_module_init) +module_exit(htb_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c new file mode 100644 index 00000000000..8edc32a6ad2 --- /dev/null +++ b/net/sched/sch_ingress.c @@ -0,0 +1,436 @@ +/* net/sched/sch_ingress.c - Ingress qdisc + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim 1999 + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter.h> +#include <linux/smp.h> +#include <net/pkt_sched.h> +#include <asm/byteorder.h> +#include <asm/uaccess.h> +#include <linux/kmod.h> +#include <linux/stat.h> +#include <linux/interrupt.h> +#include <linux/list.h> + + +#undef DEBUG_INGRESS + +#ifdef DEBUG_INGRESS /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +#define PRIV(sch) qdisc_priv(sch) + + +/* Thanks to Doron Oz for this hack +*/ +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER +static int nf_registered; +#endif +#endif + +struct ingress_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int ingress_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + + DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", + sch, p, new, old); + DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); + return 1; +} + + +static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + + +static unsigned long ingress_get(struct Qdisc *sch,u32 classid) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + return TC_H_MIN(classid) + 1; +} + + +static unsigned long ingress_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return ingress_get(sch, classid); +} + + +static void ingress_put(struct Qdisc *sch, unsigned long cl) +{ +} + + +static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," + "arg 0x%lx\n", sch, p, classid, parent, *arg); + DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); + return 0; +} + + + +static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); +} + + +static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct ingress_qdisc_data *p = PRIV(sch); + + return &p->filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + struct tcf_result res; + int result; + + D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + result = tc_classify(skb, p->filter_list, &res); + D2PRINTK("result %d class 0x%04x\n", result, res.classid); + /* + * Unlike normal "enqueue" functions, ingress_enqueue returns a + * firewall FW_* code. + */ +#ifdef CONFIG_NET_CLS_ACT + sch->bstats.packets++; + sch->bstats.bytes += skb->len; + switch (result) { + case TC_ACT_SHOT: + result = TC_ACT_SHOT; + sch->qstats.drops++; + break; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + result = TC_ACT_STOLEN; + break; + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + case TC_ACT_UNSPEC: + default: + skb->tc_index = TC_H_MIN(res.classid); + result = TC_ACT_OK; + break; + }; +/* backward compat */ +#else +#ifdef CONFIG_NET_CLS_POLICE + switch (result) { + case TC_POLICE_SHOT: + result = NF_DROP; + sch->qstats.drops++; + break; + case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */ + case TC_POLICE_OK: + case TC_POLICE_UNSPEC: + default: + sch->bstats.packets++; + sch->bstats.bytes += skb->len; + result = NF_ACCEPT; + break; + }; + +#else + D2PRINTK("Overriding result to ACCEPT\n"); + result = NF_ACCEPT; + sch->bstats.packets++; + sch->bstats.bytes += skb->len; +#endif +#endif + + return result; +} + + +static struct sk_buff *ingress_dequeue(struct Qdisc *sch) +{ +/* + struct ingress_qdisc_data *p = PRIV(sch); + D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); +*/ + return NULL; +} + + +static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ +/* + struct ingress_qdisc_data *p = PRIV(sch); + D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); +*/ + return 0; +} + +static unsigned int ingress_drop(struct Qdisc *sch) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); + return 0; +} + +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER +static unsigned int +ing_hook(unsigned int hook, struct sk_buff **pskb, + const struct net_device *indev, + const struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + + struct Qdisc *q; + struct sk_buff *skb = *pskb; + struct net_device *dev = skb->dev; + int fwres=NF_ACCEPT; + + DPRINTK("ing_hook: skb %s dev=%s len=%u\n", + skb->sk ? "(owned)" : "(unowned)", + skb->dev ? (*pskb)->dev->name : "(no dev)", + skb->len); + +/* +revisit later: Use a private since lock dev->queue_lock is also +used on the egress (might slow things for an iota) +*/ + + if (dev->qdisc_ingress) { + spin_lock(&dev->queue_lock); + if ((q = dev->qdisc_ingress) != NULL) + fwres = q->enqueue(skb, q); + spin_unlock(&dev->queue_lock); + } + + return fwres; +} + +/* after ipt_filter */ +static struct nf_hook_ops ing_ops = { + .hook = ing_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_FILTER + 1, +}; + +static struct nf_hook_ops ing6_ops = { + .hook = ing_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_FILTER + 1, +}; + +#endif +#endif + +static int ingress_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct ingress_qdisc_data *p = PRIV(sch); + +/* Make sure either netfilter or preferably CLS_ACT is +* compiled in */ +#ifndef CONFIG_NET_CLS_ACT +#ifndef CONFIG_NETFILTER + printk("You MUST compile classifier actions into the kernel\n"); + return -EINVAL; +#else + printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); +#endif +#endif + +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER + if (!nf_registered) { + if (nf_register_hook(&ing_ops) < 0) { + printk("ingress qdisc registration error \n"); + return -EINVAL; + } + nf_registered++; + + if (nf_register_hook(&ing6_ops) < 0) { + printk("IPv6 ingress qdisc registration error, " \ + "disabling IPv6 support.\n"); + } else + nf_registered++; + } +#endif +#endif + + DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + p->q = &noop_qdisc; + return 0; +} + + +static void ingress_reset(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + + DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); + +/* +#if 0 +*/ +/* for future use */ + qdisc_reset(p->q); +/* +#endif +*/ +} + +/* ------------------------------------------------------------- */ + + +/* ------------------------------------------------------------- */ + +static void ingress_destroy(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + struct tcf_proto *tp; + + DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); + while (p->filter_list) { + tp = p->filter_list; + p->filter_list = tp->next; + tcf_destroy(tp); + } +#if 0 +/* for future use */ + qdisc_destroy(p->q); +#endif +} + + +static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr *) b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_class_ops ingress_class_ops = { + .graft = ingress_graft, + .leaf = ingress_leaf, + .get = ingress_get, + .put = ingress_put, + .change = ingress_change, + .delete = NULL, + .walk = ingress_walk, + .tcf_chain = ingress_find_tcf, + .bind_tcf = ingress_bind_filter, + .unbind_tcf = ingress_put, + .dump = NULL, +}; + +static struct Qdisc_ops ingress_qdisc_ops = { + .next = NULL, + .cl_ops = &ingress_class_ops, + .id = "ingress", + .priv_size = sizeof(struct ingress_qdisc_data), + .enqueue = ingress_enqueue, + .dequeue = ingress_dequeue, + .requeue = ingress_requeue, + .drop = ingress_drop, + .init = ingress_init, + .reset = ingress_reset, + .destroy = ingress_destroy, + .change = NULL, + .dump = ingress_dump, + .owner = THIS_MODULE, +}; + +static int __init ingress_module_init(void) +{ + int ret = 0; + + if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { + printk("Unable to register Ingress qdisc\n"); + return ret; + } + + return ret; +} +static void __exit ingress_module_exit(void) +{ + unregister_qdisc(&ingress_qdisc_ops); +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER + if (nf_registered) { + nf_unregister_hook(&ing_ops); + if (nf_registered > 1) + nf_unregister_hook(&ing6_ops); + } +#endif +#endif +} +module_init(ingress_module_init) +module_exit(ingress_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c new file mode 100644 index 00000000000..31c29deb139 --- /dev/null +++ b/net/sched/sch_netem.c @@ -0,0 +1,598 @@ +/* + * net/sched/sch_netem.c Network emulator + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Many of the algorithms and ideas for this came from + * NIST Net which is not copyrighted. + * + * Authors: Stephen Hemminger <shemminger@osdl.org> + * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> + +#include <net/pkt_sched.h> + +/* Network Emulation Queuing algorithm. + ==================================== + + Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based + Network Emulation Tool + [2] Luigi Rizzo, DummyNet for FreeBSD + + ---------------------------------------------------------------- + + This started out as a simple way to delay outgoing packets to + test TCP but has grown to include most of the functionality + of a full blown network emulator like NISTnet. It can delay + packets and add random jitter (and correlation). The random + distribution can be loaded from a table as well to provide + normal, Pareto, or experimental curves. Packet loss, + duplication, and reordering can also be emulated. + + This qdisc does not do classification that can be handled in + layering other disciplines. It does not need to do bandwidth + control either since that can be handled by using token + bucket or other rate control. + + The simulator is limited by the Linux timer resolution + and will create packet bursts on the HZ boundary (1ms). +*/ + +struct netem_sched_data { + struct Qdisc *qdisc; + struct sk_buff_head delayed; + struct timer_list timer; + + u32 latency; + u32 loss; + u32 limit; + u32 counter; + u32 gap; + u32 jitter; + u32 duplicate; + + struct crndstate { + unsigned long last; + unsigned long rho; + } delay_cor, loss_cor, dup_cor; + + struct disttable { + u32 size; + s16 table[0]; + } *delay_dist; +}; + +/* Time stamp put into socket buffer control block */ +struct netem_skb_cb { + psched_time_t time_to_send; +}; + +/* init_crandom - initialize correlated random number generator + * Use entropy source for initial seed. + */ +static void init_crandom(struct crndstate *state, unsigned long rho) +{ + state->rho = rho; + state->last = net_random(); +} + +/* get_crandom - correlated random number generator + * Next number depends on last value. + * rho is scaled to avoid floating point. + */ +static unsigned long get_crandom(struct crndstate *state) +{ + u64 value, rho; + unsigned long answer; + + if (state->rho == 0) /* no correllation */ + return net_random(); + + value = net_random(); + rho = (u64)state->rho + 1; + answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; + state->last = answer; + return answer; +} + +/* tabledist - return a pseudo-randomly distributed value with mean mu and + * std deviation sigma. Uses table lookup to approximate the desired + * distribution, and a uniformly-distributed pseudo-random source. + */ +static long tabledist(unsigned long mu, long sigma, + struct crndstate *state, const struct disttable *dist) +{ + long t, x; + unsigned long rnd; + + if (sigma == 0) + return mu; + + rnd = get_crandom(state); + + /* default uniform distribution */ + if (dist == NULL) + return (rnd % (2*sigma)) - sigma + mu; + + t = dist->table[rnd % dist->size]; + x = (sigma % NETEM_DIST_SCALE) * t; + if (x >= 0) + x += NETEM_DIST_SCALE/2; + else + x -= NETEM_DIST_SCALE/2; + + return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; +} + +/* Put skb in the private delayed queue. */ +static int delay_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + psched_tdiff_t td; + psched_time_t now; + + PSCHED_GET_TIME(now); + td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); + PSCHED_TADD2(now, td, cb->time_to_send); + + /* Always queue at tail to keep packets in order */ + if (likely(q->delayed.qlen < q->limit)) { + __skb_queue_tail(&q->delayed, skb); + if (!timer_pending(&q->timer)) { + q->timer.expires = jiffies + PSCHED_US2JIFFIE(td); + add_timer(&q->timer); + } + return NET_XMIT_SUCCESS; + } + + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb2; + int ret; + + pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); + + /* Random packet drop 0 => none, ~0 => all */ + if (q->loss && q->loss >= get_crandom(&q->loss_cor)) { + pr_debug("netem_enqueue: random loss\n"); + sch->qstats.drops++; + kfree_skb(skb); + return 0; /* lie about loss so TCP doesn't know */ + } + + /* Random duplication */ + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor) + && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { + pr_debug("netem_enqueue: dup %p\n", skb2); + + if (delay_skb(sch, skb2)) { + sch->q.qlen++; + sch->bstats.bytes += skb2->len; + sch->bstats.packets++; + } else + sch->qstats.drops++; + } + + /* If doing simple delay then gap == 0 so all packets + * go into the delayed holding queue + * otherwise if doing out of order only "1 out of gap" + * packets will be delayed. + */ + if (q->counter < q->gap) { + ++q->counter; + ret = q->qdisc->enqueue(skb, q->qdisc); + } else { + q->counter = 0; + ret = delay_skb(sch, skb); + } + + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + } else + sch->qstats.drops++; + + return ret; +} + +/* Requeue packets but don't change time stamp */ +static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + } + + return ret; +} + +static unsigned int netem_drop(struct Qdisc* sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned int len; + + if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +/* Dequeue packet. + * Move all packets that are ready to send from the delay holding + * list to the underlying qdisc, then just call dequeue + */ +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = q->qdisc->dequeue(q->qdisc); + if (skb) + sch->q.qlen--; + return skb; +} + +static void netem_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct netem_sched_data *q = qdisc_priv(sch); + struct net_device *dev = sch->dev; + struct sk_buff *skb; + psched_time_t now; + + pr_debug("netem_watchdog: fired @%lu\n", jiffies); + + spin_lock_bh(&dev->queue_lock); + PSCHED_GET_TIME(now); + + while ((skb = skb_peek(&q->delayed)) != NULL) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + long delay + = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_watchdog: skb %p@%lu %ld\n", + skb, jiffies, delay); + + /* if more time remaining? */ + if (delay > 0) { + mod_timer(&q->timer, jiffies + delay); + break; + } + __skb_unlink(skb, &q->delayed); + + if (q->qdisc->enqueue(skb, q->qdisc)) { + sch->q.qlen--; + sch->qstats.drops++; + } + } + qdisc_run(dev); + spin_unlock_bh(&dev->queue_lock); +} + +static void netem_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + skb_queue_purge(&q->delayed); + + sch->q.qlen = 0; + del_timer_sync(&q->timer); +} + +static int set_fifo_limit(struct Qdisc *q, int limit) +{ + struct rtattr *rta; + int ret = -ENOMEM; + + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (rta) { + rta->rta_type = RTM_NEWQDISC; + rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + + ret = q->ops->change(q, rta); + kfree(rta); + } + return ret; +} + +/* + * Distribution data is a variable size payload containing + * signed 16 bit values. + */ +static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); + const __s16 *data = RTA_DATA(attr); + struct disttable *d; + int i; + + if (n > 65536) + return -EINVAL; + + d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->size = n; + for (i = 0; i < n; i++) + d->table[i] = data[i]; + + spin_lock_bh(&sch->dev->queue_lock); + d = xchg(&q->delay_dist, d); + spin_unlock_bh(&sch->dev->queue_lock); + + kfree(d); + return 0; +} + +static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corr *c = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*c)) + return -EINVAL; + + init_crandom(&q->delay_cor, c->delay_corr); + init_crandom(&q->loss_cor, c->loss_corr); + init_crandom(&q->dup_cor, c->dup_corr); + return 0; +} + +static int netem_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct tc_netem_qopt *qopt; + int ret; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = RTA_DATA(opt); + ret = set_fifo_limit(q->qdisc, qopt->limit); + if (ret) { + pr_debug("netem: can't set fifo limit\n"); + return ret; + } + + q->latency = qopt->latency; + q->jitter = qopt->jitter; + q->limit = qopt->limit; + q->gap = qopt->gap; + q->loss = qopt->loss; + q->duplicate = qopt->duplicate; + + /* Handle nested options after initial queue options. + * Should have put all options in nested format but too late now. + */ + if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { + struct rtattr *tb[TCA_NETEM_MAX]; + if (rtattr_parse(tb, TCA_NETEM_MAX, + RTA_DATA(opt) + sizeof(*qopt), + RTA_PAYLOAD(opt) - sizeof(*qopt))) + return -EINVAL; + + if (tb[TCA_NETEM_CORR-1]) { + ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); + if (ret) + return ret; + } + + if (tb[TCA_NETEM_DELAY_DIST-1]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); + if (ret) + return ret; + } + } + + + return 0; +} + +static int netem_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if (!opt) + return -EINVAL; + + skb_queue_head_init(&q->delayed); + init_timer(&q->timer); + q->timer.function = netem_watchdog; + q->timer.data = (unsigned long) sch; + q->counter = 0; + + q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (!q->qdisc) { + pr_debug("netem: qdisc create failed\n"); + return -ENOMEM; + } + + ret = netem_change(sch, opt); + if (ret) { + pr_debug("netem: change failed\n"); + qdisc_destroy(q->qdisc); + } + return ret; +} + +static void netem_destroy(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + del_timer_sync(&q->timer); + qdisc_destroy(q->qdisc); + kfree(q->delay_dist); +} + +static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + const struct netem_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta = (struct rtattr *) b; + struct tc_netem_qopt qopt; + struct tc_netem_corr cor; + + qopt.latency = q->latency; + qopt.jitter = q->jitter; + qopt.limit = q->limit; + qopt.loss = q->loss; + qopt.gap = q->gap; + qopt.duplicate = q->duplicate; + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + cor.delay_corr = q->delay_cor.rho; + cor.loss_corr = q->loss_cor.rho; + cor.dup_corr = q->dup_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int netem_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = xchg(&q->qdisc, new); + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct netem_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long netem_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void netem_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int netem_delete(struct Qdisc *sch, unsigned long arg) +{ + return -ENOSYS; +} + +static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + +static struct Qdisc_class_ops netem_class_ops = { + .graft = netem_graft, + .leaf = netem_leaf, + .get = netem_get, + .put = netem_put, + .change = netem_change_class, + .delete = netem_delete, + .walk = netem_walk, + .tcf_chain = netem_find_tcf, + .dump = netem_dump_class, +}; + +static struct Qdisc_ops netem_qdisc_ops = { + .id = "netem", + .cl_ops = &netem_class_ops, + .priv_size = sizeof(struct netem_sched_data), + .enqueue = netem_enqueue, + .dequeue = netem_dequeue, + .requeue = netem_requeue, + .drop = netem_drop, + .init = netem_init, + .reset = netem_reset, + .destroy = netem_destroy, + .change = netem_change, + .dump = netem_dump, + .owner = THIS_MODULE, +}; + + +static int __init netem_module_init(void) +{ + return register_qdisc(&netem_qdisc_ops); +} +static void __exit netem_module_exit(void) +{ + unregister_qdisc(&netem_qdisc_ops); +} +module_init(netem_module_init) +module_exit(netem_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c new file mode 100644 index 00000000000..3ac0f495bad --- /dev/null +++ b/net/sched/sch_prio.c @@ -0,0 +1,444 @@ +/* + * net/sched/sch_prio.c Simple 3-band priority "scheduler". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: + * Init -- EINVAL when opt undefined + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +struct prio_sched_data +{ + int bands; + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; + struct Qdisc *queues[TCQ_PRIO_BANDS]; +}; + + +static struct Qdisc * +prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct prio_sched_data *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + + *qerr = NET_XMIT_DROP; + if (TC_H_MAJ(skb->priority) != sch->handle) { +#ifdef CONFIG_NET_CLS_ACT + switch (tc_classify(skb, q->filter_list, &res)) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + }; + + if (!q->filter_list ) { +#else + if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { +#endif + if (TC_H_MAJ(band)) + band = 0; + return q->queues[q->prio2band[band&TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band > q->bands) + return q->queues[q->prio2band[0]]; + + return q->queues[band]; +} + +static int +prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = prio_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + sch->qstats.drops++; + return ret; +} + + +static int +prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = prio_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->qstats.requeues++; + return 0; + } + sch->qstats.drops++; + return NET_XMIT_DROP; +} + + +static struct sk_buff * +prio_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + struct Qdisc *qdisc; + + for (prio = 0; prio < q->bands; prio++) { + qdisc = q->queues[prio]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + return skb; + } + } + return NULL; + +} + +static unsigned int prio_drop(struct Qdisc* sch) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + unsigned int len; + struct Qdisc *qdisc; + + for (prio = q->bands-1; prio >= 0; prio--) { + qdisc = q->queues[prio]; + if ((len = qdisc->ops->drop(qdisc)) != 0) { + sch->q.qlen--; + return len; + } + } + return 0; +} + + +static void +prio_reset(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = qdisc_priv(sch); + + for (prio=0; prio<q->bands; prio++) + qdisc_reset(q->queues[prio]); + sch->q.qlen = 0; +} + +static void +prio_destroy(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = qdisc_priv(sch); + struct tcf_proto *tp; + + while ((tp = q->filter_list) != NULL) { + q->filter_list = tp->next; + tcf_destroy(tp); + } + + for (prio=0; prio<q->bands; prio++) + qdisc_destroy(q->queues[prio]); +} + +static int prio_tune(struct Qdisc *sch, struct rtattr *opt) +{ + struct prio_sched_data *q = qdisc_priv(sch); + struct tc_prio_qopt *qopt = RTA_DATA(opt); + int i; + + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + return -EINVAL; + if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) + return -EINVAL; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= qopt->bands) + return -EINVAL; + } + + sch_tree_lock(sch); + q->bands = qopt->bands; + memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); + + for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { + struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); + if (child != &noop_qdisc) + qdisc_destroy(child); + } + sch_tree_unlock(sch); + + for (i=0; i<=TC_PRIO_MAX; i++) { + int band = q->prio2band[i]; + if (q->queues[band] == &noop_qdisc) { + struct Qdisc *child; + child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (child) { + sch_tree_lock(sch); + child = xchg(&q->queues[band], child); + + if (child != &noop_qdisc) + qdisc_destroy(child); + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int prio_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int i; + + for (i=0; i<TCQ_PRIO_BANDS; i++) + q->queues[i] = &noop_qdisc; + + if (opt == NULL) { + return -EINVAL; + } else { + int err; + + if ((err= prio_tune(sch, opt)) != 0) + return err; + } + return 0; +} + +static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +prio_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return NULL; + + return q->queues[band]; +} + +static unsigned long prio_get(struct Qdisc *sch, u32 classid) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + return prio_get(sch, classid); +} + + +static void prio_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int prio_delete(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = qdisc_priv(sch); + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + if (q->queues[cl-1]) + tcm->tcm_info = q->queues[cl-1]->handle; + return 0; +} + +static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + + if (arg->stop) + return; + + for (prio = 0; prio < q->bands; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static struct Qdisc_class_ops prio_class_ops = { + .graft = prio_graft, + .leaf = prio_leaf, + .get = prio_get, + .put = prio_put, + .change = prio_change, + .delete = prio_delete, + .walk = prio_walk, + .tcf_chain = prio_find_tcf, + .bind_tcf = prio_bind, + .unbind_tcf = prio_put, + .dump = prio_dump_class, +}; + +static struct Qdisc_ops prio_qdisc_ops = { + .next = NULL, + .cl_ops = &prio_class_ops, + .id = "prio", + .priv_size = sizeof(struct prio_sched_data), + .enqueue = prio_enqueue, + .dequeue = prio_dequeue, + .requeue = prio_requeue, + .drop = prio_drop, + .init = prio_init, + .reset = prio_reset, + .destroy = prio_destroy, + .change = prio_tune, + .dump = prio_dump, + .owner = THIS_MODULE, +}; + +static int __init prio_module_init(void) +{ + return register_qdisc(&prio_qdisc_ops); +} + +static void __exit prio_module_exit(void) +{ + unregister_qdisc(&prio_qdisc_ops); +} + +module_init(prio_module_init) +module_exit(prio_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 00000000000..664d0e47374 --- /dev/null +++ b/net/sched/sch_red.c @@ -0,0 +1,459 @@ +/* + * net/sched/sch_red.c Random Early Detection queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * J Hadi Salim <hadi@nortel.com> 980914: computation fixes + * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. + * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/dsfield.h> + + +/* Random Early Detection (RED) algorithm. + ======================================= + + Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways + for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. + + This file codes a "divisionless" version of RED algorithm + as written down in Fig.17 of the paper. + +Short description. +------------------ + + When a new packet arrives we calculate the average queue length: + + avg = (1-W)*avg + W*current_queue_len, + + W is the filter time constant (chosen as 2^(-Wlog)), it controls + the inertia of the algorithm. To allow larger bursts, W should be + decreased. + + if (avg > th_max) -> packet marked (dropped). + if (avg < th_min) -> packet passes. + if (th_min < avg < th_max) we calculate probability: + + Pb = max_P * (avg - th_min)/(th_max-th_min) + + and mark (drop) packet with this probability. + Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). + max_P should be small (not 1), usually 0.01..0.02 is good value. + + max_P is chosen as a number, so that max_P/(th_max-th_min) + is a negative power of two in order arithmetics to contain + only shifts. + + + Parameters, settable by user: + ----------------------------- + + limit - bytes (must be > qth_max + burst) + + Hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect the algorithms behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be reached + if RED works correctly. + + qth_min - bytes (should be < qth_max/2) + qth_max - bytes (should be at least 2*qth_min and less limit) + Wlog - bits (<32) log(1/W). + Plog - bits (<32) + + Plog is related to max_P by formula: + + max_P = (qth_max-qth_min)/2^Plog; + + F.e. if qth_max=128K and qth_min=32K, then Plog=22 + corresponds to max_P=0.02 + + Scell_log + Stab + + Lookup table for log((1-W)^(t/t_ave). + + +NOTES: + +Upper bound on W. +----------------- + + If you want to allow bursts of L packets of size S, + you should choose W: + + L + 1 - th_min/S < (1-(1-W)^L)/W + + th_min/S = 32 th_min/S = 4 + + log(W) L + -1 33 + -2 35 + -3 39 + -4 46 + -5 57 + -6 75 + -7 101 + -8 135 + -9 190 + etc. + */ + +struct red_sched_data +{ +/* Parameters */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 Rmask; + u32 Scell_max; + unsigned char flags; + char Wlog; /* log(W) */ + char Plog; /* random number bits */ + char Scell_log; + u8 Stab[256]; + +/* Variables */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + u32 qR; /* Cached random number */ + + psched_time_t qidlestart; /* Start of idle period */ + struct tc_red_xstats st; +}; + +static int red_ecn_mark(struct sk_buff *skb) +{ + if (skb->nh.raw + 20 > skb->tail) + return 0; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + if (INET_ECN_is_not_ect(skb->nh.iph->tos)) + return 0; + IP_ECN_set_ce(skb->nh.iph); + return 1; + case __constant_htons(ETH_P_IPV6): + if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h))) + return 0; + IP6_ECN_set_ce(skb->nh.ipv6h); + return 1; + default: + return 0; + } +} + +static int +red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + psched_time_t now; + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + int shift; + + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); + PSCHED_SET_PASTPERFECT(q->qidlestart); + +/* + The problem: ideally, average length queue recalcultion should + be done over constant clock intervals. This is too expensive, so that + the calculation is driven by outgoing packets. + When the queue is idle we have to model this clock by hand. + + SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) + dummy packets as a burst after idle time, i.e. + + q->qave *= (1-W)^m + + This is an apparently overcomplicated solution (f.e. we have to precompute + a table to make this calculation in reasonable time) + I believe that a simpler model may be used here, + but it is field for experiments. +*/ + shift = q->Stab[us_idle>>q->Scell_log]; + + if (shift) { + q->qave >>= shift; + } else { + /* Approximate initial part of exponent + with linear function: + (1-W)^m ~= 1-mW + ... + + Seems, it is the best solution to + problem of too coarce exponent tabulation. + */ + + us_idle = (q->qave * us_idle)>>q->Scell_log; + if (us_idle < q->qave/2) + q->qave -= us_idle; + else + q->qave >>= 1; + } + } else { + q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); + /* NOTE: + q->qave is fixed point number with point at Wlog. + The formulae above is equvalent to floating point + version: + + qave = qave*(1-W) + sch->qstats.backlog*W; + --ANK (980924) + */ + } + + if (q->qave < q->qth_min) { + q->qcount = -1; +enqueue: + if (sch->qstats.backlog + skb->len <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return NET_XMIT_SUCCESS; + } else { + q->st.pdrop++; + } + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; + } + if (q->qave >= q->qth_max) { + q->qcount = -1; + sch->qstats.overlimits++; +mark: + if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { + q->st.early++; + goto drop; + } + q->st.marked++; + goto enqueue; + } + + if (++q->qcount) { + /* The formula used below causes questions. + + OK. qR is random number in the interval 0..Rmask + i.e. 0..(2^Plog). If we used floating point + arithmetics, it would be: (2^Plog)*rnd_num, + where rnd_num is less 1. + + Taking into account, that qave have fixed + point at Wlog, and Plog is related to max_P by + max_P = (qth_max-qth_min)/2^Plog; two lines + below have the following floating point equivalent: + + max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount + + Any questions? --ANK (980924) + */ + if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = net_random()&q->Rmask; + sch->qstats.overlimits++; + goto mark; + } + q->qR = net_random()&q->Rmask; + goto enqueue; + +drop: + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_CN; +} + +static int +red_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff * +red_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = qdisc_priv(sch); + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->qstats.backlog -= skb->len; + return skb; + } + PSCHED_GET_TIME(q->qidlestart); + return NULL; +} + +static unsigned int red_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = qdisc_priv(sch); + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + sch->qstats.drops++; + q->st.other++; + kfree_skb(skb); + return len; + } + PSCHED_GET_TIME(q->qidlestart); + return 0; +} + +static void red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + __skb_queue_purge(&sch->q); + sch->qstats.backlog = 0; + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; +} + +static int red_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_RED_STAB]; + struct tc_red_qopt *ctl; + + if (opt == NULL || + rtattr_parse_nested(tb, TCA_RED_STAB, opt) || + tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + + sch_tree_lock(sch); + q->flags = ctl->flags; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_log = ctl->Scell_log; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + q->limit = ctl->limit; + memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); + + q->qcount = -1; + if (skb_queue_len(&sch->q) == 0) + PSCHED_SET_PASTPERFECT(q->qidlestart); + sch_tree_unlock(sch); + return 0; +} + +static int red_init(struct Qdisc* sch, struct rtattr *opt) +{ + return red_change(sch, opt); +} + +static int red_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct red_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_red_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + opt.limit = q->limit; + opt.qth_min = q->qth_min>>q->Wlog; + opt.qth_max = q->qth_max>>q->Wlog; + opt.Wlog = q->Wlog; + opt.Plog = q->Plog; + opt.Scell_log = q->Scell_log; + opt.flags = q->flags; + RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct red_sched_data *q = qdisc_priv(sch); + + return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); +} + +static struct Qdisc_ops red_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "red", + .priv_size = sizeof(struct red_sched_data), + .enqueue = red_enqueue, + .dequeue = red_dequeue, + .requeue = red_requeue, + .drop = red_drop, + .init = red_init, + .reset = red_reset, + .change = red_change, + .dump = red_dump, + .dump_stats = red_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init red_module_init(void) +{ + return register_qdisc(&red_qdisc_ops); +} +static void __exit red_module_exit(void) +{ + unregister_qdisc(&red_qdisc_ops); +} +module_init(red_module_init) +module_exit(red_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c new file mode 100644 index 00000000000..8734bb7280e --- /dev/null +++ b/net/sched/sch_sfq.c @@ -0,0 +1,497 @@ +/* + * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Stochastic Fairness Queuing algorithm. + ======================================= + + Source: + Paul E. McKenney "Stochastic Fairness Queuing", + IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + + Paul E. McKenney "Stochastic Fairness Queuing", + "Interworking: Research and Experience", v.2, 1991, p.113-131. + + + See also: + M. Shreedhar and George Varghese "Efficient Fair + Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + + + This is not the thing that is usually called (W)FQ nowadays. + It does not use any timestamp mechanism, but instead + processes queues in round-robin order. + + ADVANTAGE: + + - It is very cheap. Both CPU and memory requirements are minimal. + + DRAWBACKS: + + - "Stochastic" -> It is not 100% fair. + When hash collisions occur, several flows are considered as one. + + - "Round-robin" -> It introduces larger delays than virtual clock + based schemes, and should not be used for isolating interactive + traffic from non-interactive. It means, that this scheduler + should be used as leaf of CBQ or P3, which put interactive traffic + to higher priority band. + + We still need true WFQ for top level CSZ, but using WFQ + for the best effort traffic is absolutely pointless: + SFQ is superior for this purpose. + + IMPLEMENTATION: + This implementation limits maximal queue length to 128; + maximal mtu to 2^15-1; number of hash buckets to 1024. + The only goal of this restrictions was that all data + fit into one 4K page :-). Struct sfq_sched_data is + organized in anti-cache manner: all the data for a bucket + are scattered over different locations. This is not good, + but it allowed me to put it into 4K. + + It is easy to increase these values, but not in flight. */ + +#define SFQ_DEPTH 128 +#define SFQ_HASH_DIVISOR 1024 + +/* This type should contain at least SFQ_DEPTH*2 values */ +typedef unsigned char sfq_index; + +struct sfq_head +{ + sfq_index next; + sfq_index prev; +}; + +struct sfq_sched_data +{ +/* Parameters */ + int perturb_period; + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + int limit; + +/* Variables */ + struct timer_list perturb_timer; + int perturbation; + sfq_index tail; /* Index of current slot in round */ + sfq_index max_depth; /* Maximal depth */ + + sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index next[SFQ_DEPTH]; /* Active slots link */ + short allot[SFQ_DEPTH]; /* Current allotment per slot */ + unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ + struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ + struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ +}; + +static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +{ + int pert = q->perturbation; + + /* Have we any rotation primitives? If not, WHY? */ + h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); + h ^= h>>10; + return h & 0x3FF; +} + +static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + struct iphdr *iph = skb->nh.iph; + h = iph->daddr; + h2 = iph->saddr^iph->protocol; + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && + (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_ESP)) + h2 ^= *(((u32*)iph) + iph->ihl); + break; + } + case __constant_htons(ETH_P_IPV6): + { + struct ipv6hdr *iph = skb->nh.ipv6h; + h = iph->daddr.s6_addr32[3]; + h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; + if (iph->nexthdr == IPPROTO_TCP || + iph->nexthdr == IPPROTO_UDP || + iph->nexthdr == IPPROTO_ESP) + h2 ^= *(u32*)&iph[1]; + break; + } + default: + h = (u32)(unsigned long)skb->dst^skb->protocol; + h2 = (u32)(unsigned long)skb->sk; + } + return sfq_fold_hash(q, h, h2); +} + +static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d = q->qs[x].qlen + SFQ_DEPTH; + + p = d; + n = q->dep[d].next; + q->dep[x].next = n; + q->dep[x].prev = p; + q->dep[p].next = q->dep[n].prev = x; +} + +static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + + if (n == p && q->max_depth == q->qs[x].qlen + 1) + q->max_depth--; + + sfq_link(q, x); +} + +static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + d = q->qs[x].qlen; + if (q->max_depth < d) + q->max_depth = d; + + sfq_link(q, x); +} + +static unsigned int sfq_drop(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + sfq_index d = q->max_depth; + struct sk_buff *skb; + unsigned int len; + + /* Queue is full! Find the longest slot and + drop a packet from it */ + + if (d > 1) { + sfq_index x = q->dep[d+SFQ_DEPTH].next; + skb = q->qs[x].prev; + len = skb->len; + __skb_unlink(skb, &q->qs[x]); + kfree_skb(skb); + sfq_dec(q, x); + sch->q.qlen--; + sch->qstats.drops++; + return len; + } + + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + len = skb->len; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb); + sfq_dec(q, d); + sch->q.qlen--; + q->ht[q->hash[d]] = SFQ_DEPTH; + sch->qstats.drops++; + return len; + } + + return 0; +} + +static int +sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_tail(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit-1) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + + sfq_drop(sch); + return NET_XMIT_CN; +} + +static int +sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_head(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit - 1) { + sch->qstats.requeues++; + return 0; + } + + sch->qstats.drops++; + sfq_drop(sch); + return NET_XMIT_CN; +} + + + + +static struct sk_buff * +sfq_dequeue(struct Qdisc* sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + sfq_index a, old_a; + + /* No active slots */ + if (q->tail == SFQ_DEPTH) + return NULL; + + a = old_a = q->next[q->tail]; + + /* Grab packet */ + skb = __skb_dequeue(&q->qs[a]); + sfq_dec(q, a); + sch->q.qlen--; + + /* Is the slot empty? */ + if (q->qs[a].qlen == 0) { + q->ht[q->hash[a]] = SFQ_DEPTH; + a = q->next[a]; + if (a == old_a) { + q->tail = SFQ_DEPTH; + return skb; + } + q->next[q->tail] = a; + q->allot[a] += q->quantum; + } else if ((q->allot[a] -= skb->len) <= 0) { + q->tail = a; + a = q->next[a]; + q->allot[a] += q->quantum; + } + return skb; +} + +static void +sfq_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb = sfq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void sfq_perturbation(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct sfq_sched_data *q = qdisc_priv(sch); + + q->perturbation = net_random()&0x1F; + + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } +} + +static int sfq_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct tc_sfq_qopt *ctl = RTA_DATA(opt); + + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + + sch_tree_lock(sch); + q->quantum = ctl->quantum ? : psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; + if (ctl->limit) + q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); + + while (sch->q.qlen >= q->limit-1) + sfq_drop(sch); + + del_timer(&q->perturb_timer); + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } + sch_tree_unlock(sch); + return 0; +} + +static int sfq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + int i; + + init_timer(&q->perturb_timer); + q->perturb_timer.data = (unsigned long)sch; + q->perturb_timer.function = sfq_perturbation; + + for (i=0; i<SFQ_HASH_DIVISOR; i++) + q->ht[i] = SFQ_DEPTH; + for (i=0; i<SFQ_DEPTH; i++) { + skb_queue_head_init(&q->qs[i]); + q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; + q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + } + q->limit = SFQ_DEPTH; + q->max_depth = 0; + q->tail = SFQ_DEPTH; + if (opt == NULL) { + q->quantum = psched_mtu(sch->dev); + q->perturb_period = 0; + } else { + int err = sfq_change(sch, opt); + if (err) + return err; + } + for (i=0; i<SFQ_DEPTH; i++) + sfq_link(q, i); + return 0; +} + +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + del_timer(&q->perturb_timer); +} + +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_sfq_qopt opt; + + opt.quantum = q->quantum; + opt.perturb_period = q->perturb_period/HZ; + + opt.limit = q->limit; + opt.divisor = SFQ_HASH_DIVISOR; + opt.flows = q->limit; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_ops sfq_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "sfq", + .priv_size = sizeof(struct sfq_sched_data), + .enqueue = sfq_enqueue, + .dequeue = sfq_dequeue, + .requeue = sfq_requeue, + .drop = sfq_drop, + .init = sfq_init, + .reset = sfq_reset, + .destroy = sfq_destroy, + .change = NULL, + .dump = sfq_dump, + .owner = THIS_MODULE, +}; + +static int __init sfq_module_init(void) +{ + return register_qdisc(&sfq_qdisc_ops); +} +static void __exit sfq_module_exit(void) +{ + unregister_qdisc(&sfq_qdisc_ops); +} +module_init(sfq_module_init) +module_exit(sfq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c new file mode 100644 index 00000000000..cb9711ea8c6 --- /dev/null +++ b/net/sched/sch_tbf.c @@ -0,0 +1,543 @@ +/* + * net/sched/sch_tbf.c Token Bucket Filter queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - + * original idea by Martin Devera + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Simple Token Bucket Filter. + ======================================= + + SOURCE. + ------- + + None. + + Description. + ------------ + + A data flow obeys TBF with rate R and depth B, if for any + time interval t_i...t_f the number of transmitted bits + does not exceed B + R*(t_f-t_i). + + Packetized version of this definition: + The sequence of packets of sizes s_i served at moments t_i + obeys TBF, if for any i<=k: + + s_i+....+s_k <= B + R*(t_k - t_i) + + Algorithm. + ---------- + + Let N(t_i) be B/R initially and N(t) grow continuously with time as: + + N(t+delta) = min{B/R, N(t) + delta} + + If the first packet in queue has length S, it may be + transmitted only at the time t_* when S/R <= N(t_*), + and in this case N(t) jumps: + + N(t_* + 0) = N(t_* - 0) - S/R. + + + + Actually, QoS requires two TBF to be applied to a data stream. + One of them controls steady state burst size, another + one with rate P (peak rate) and depth M (equal to link MTU) + limits bursts at a smaller time scale. + + It is easy to see that P>R, and B>M. If P is infinity, this double + TBF is equivalent to a single one. + + When TBF works in reshaping mode, latency is estimated as: + + lat = max ((L-B)/R, (L-M)/P) + + + NOTES. + ------ + + If TBF throttles, it starts a watchdog timer, which will wake it up + when it is ready to transmit. + Note that the minimal timer resolution is 1/HZ. + If no new packets arrive during this period, + or if the device is not awaken by EOI for some previous packet, + TBF can stop its activity for 1/HZ. + + + This means, that with depth B, the maximal rate is + + R_crit = B*HZ + + F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. + + Note that the peak rate TBF is much more tough: with MTU 1500 + P_crit = 150Kbytes/sec. So, if you need greater peak + rates, use alpha with HZ=1000 :-) + + With classful TBF, limit is just kept for backwards compatibility. + It is passed to the default bfifo qdisc - if the inner qdisc is + changed the limit is not effective anymore. +*/ + +struct tbf_sched_data +{ +/* Parameters */ + u32 limit; /* Maximal length of backlog: bytes */ + u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + u32 mtu; + u32 max_size; + struct qdisc_rate_table *R_tab; + struct qdisc_rate_table *P_tab; + +/* Variables */ + long tokens; /* Current number of B tokens */ + long ptokens; /* Current number of P tokens */ + psched_time_t t_c; /* Time check-point */ + struct timer_list wd_timer; /* Watchdog timer */ + struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ +}; + +#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) +#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) + +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + int ret; + + if (skb->len > q->max_size) { + sch->qstats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + + return NET_XMIT_DROP; + } + + if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { + sch->qstats.drops++; + return ret; + } + + sch->q.qlen++; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; +} + +static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + int ret; + + if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + } + + return ret; +} + +static unsigned int tbf_drop(struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + unsigned int len; + + if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +static void tbf_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static struct sk_buff *tbf_dequeue(struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = q->qdisc->dequeue(q->qdisc); + + if (skb) { + psched_time_t now; + long toks, delay; + long ptoks = 0; + unsigned int len = skb->len; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer); + + if (q->P_tab) { + ptoks = toks + q->ptokens; + if (ptoks > (long)q->mtu) + ptoks = q->mtu; + ptoks -= L2T_P(q, len); + } + toks += q->tokens; + if (toks > (long)q->buffer) + toks = q->buffer; + toks -= L2T(q, len); + + if ((toks|ptoks) >= 0) { + q->t_c = now; + q->tokens = toks; + q->ptokens = ptoks; + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); + + if (delay == 0) + delay = 1; + + mod_timer(&q->wd_timer, jiffies+delay); + + /* Maybe we have a shorter packet in the queue, + which can be sent now. It sounds cool, + but, however, this is wrong in principle. + We MUST NOT reorder packets under these circumstances. + + Really, if we split the flow into independent + subflows, it would be a very good solution. + This is the main idea of all FQ algorithms + (cf. CSZ, HPFQ, HFSC) + */ + + if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { + /* When requeue fails skb is dropped */ + sch->q.qlen--; + sch->qstats.drops++; + } + + sch->flags |= TCQ_F_THROTTLED; + sch->qstats.overlimits++; + } + return NULL; +} + +static void tbf_reset(struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + PSCHED_GET_TIME(q->t_c); + q->tokens = q->buffer; + q->ptokens = q->mtu; + sch->flags &= ~TCQ_F_THROTTLED; + del_timer(&q->wd_timer); +} + +static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit) +{ + struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops); + struct rtattr *rta; + int ret; + + if (q) { + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (rta) { + rta->rta_type = RTM_NEWQDISC; + rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + + ret = q->ops->change(q, rta); + kfree(rta); + + if (ret == 0) + return q; + } + qdisc_destroy(q); + } + + return NULL; +} + +static int tbf_change(struct Qdisc* sch, struct rtattr *opt) +{ + int err = -EINVAL; + struct tbf_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_TBF_PTAB]; + struct tc_tbf_qopt *qopt; + struct qdisc_rate_table *rtab = NULL; + struct qdisc_rate_table *ptab = NULL; + struct Qdisc *child = NULL; + int max_size,n; + + if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) || + tb[TCA_TBF_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) + goto done; + + qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); + rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); + if (rtab == NULL) + goto done; + + if (qopt->peakrate.rate) { + if (qopt->peakrate.rate > qopt->rate.rate) + ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); + if (ptab == NULL) + goto done; + } + + for (n = 0; n < 256; n++) + if (rtab->data[n] > qopt->buffer) break; + max_size = (n << qopt->rate.cell_log)-1; + if (ptab) { + int size; + + for (n = 0; n < 256; n++) + if (ptab->data[n] > qopt->mtu) break; + size = (n << qopt->peakrate.cell_log)-1; + if (size < max_size) max_size = size; + } + if (max_size < 0) + goto done; + + if (q->qdisc == &noop_qdisc) { + if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL) + goto done; + } + + sch_tree_lock(sch); + if (child) q->qdisc = child; + q->limit = qopt->limit; + q->mtu = qopt->mtu; + q->max_size = max_size; + q->buffer = qopt->buffer; + q->tokens = q->buffer; + q->ptokens = q->mtu; + rtab = xchg(&q->R_tab, rtab); + ptab = xchg(&q->P_tab, ptab); + sch_tree_unlock(sch); + err = 0; +done: + if (rtab) + qdisc_put_rtab(rtab); + if (ptab) + qdisc_put_rtab(ptab); + return err; +} + +static int tbf_init(struct Qdisc* sch, struct rtattr *opt) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (opt == NULL) + return -EINVAL; + + PSCHED_GET_TIME(q->t_c); + init_timer(&q->wd_timer); + q->wd_timer.function = tbf_watchdog; + q->wd_timer.data = (unsigned long)sch; + + q->qdisc = &noop_qdisc; + + return tbf_change(sch, opt); +} + +static void tbf_destroy(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + del_timer(&q->wd_timer); + + if (q->P_tab) + qdisc_put_rtab(q->P_tab); + if (q->R_tab) + qdisc_put_rtab(q->R_tab); + + qdisc_destroy(q->qdisc); +} + +static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_tbf_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = q->limit; + opt.rate = q->R_tab->rate; + if (q->P_tab) + opt.peakrate = q->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + opt.mtu = q->mtu; + opt.buffer = q->buffer; + RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = xchg(&q->qdisc, new); + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long tbf_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void tbf_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int tbf_delete(struct Qdisc *sch, unsigned long arg) +{ + return -ENOSYS; +} + +static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + +static struct Qdisc_class_ops tbf_class_ops = +{ + .graft = tbf_graft, + .leaf = tbf_leaf, + .get = tbf_get, + .put = tbf_put, + .change = tbf_change_class, + .delete = tbf_delete, + .walk = tbf_walk, + .tcf_chain = tbf_find_tcf, + .dump = tbf_dump_class, +}; + +static struct Qdisc_ops tbf_qdisc_ops = { + .next = NULL, + .cl_ops = &tbf_class_ops, + .id = "tbf", + .priv_size = sizeof(struct tbf_sched_data), + .enqueue = tbf_enqueue, + .dequeue = tbf_dequeue, + .requeue = tbf_requeue, + .drop = tbf_drop, + .init = tbf_init, + .reset = tbf_reset, + .destroy = tbf_destroy, + .change = tbf_change, + .dump = tbf_dump, + .owner = THIS_MODULE, +}; + +static int __init tbf_module_init(void) +{ + return register_qdisc(&tbf_qdisc_ops); +} + +static void __exit tbf_module_exit(void) +{ + unregister_qdisc(&tbf_qdisc_ops); +} +module_init(tbf_module_init) +module_exit(tbf_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c new file mode 100644 index 00000000000..6cf0342706b --- /dev/null +++ b/net/sched/sch_teql.c @@ -0,0 +1,511 @@ +/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <linux/moduleparam.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* + How to setup it. + ---------------- + + After loading this module you will find a new device teqlN + and new qdisc with the same name. To join a slave to the equalizer + you should just set this qdisc on a device f.e. + + # tc qdisc add dev eth0 root teql0 + # tc qdisc add dev eth1 root teql0 + + That's all. Full PnP 8) + + Applicability. + -------------- + + 1. Slave devices MUST be active devices, i.e., they must raise the tbusy + signal and generate EOI events. If you want to equalize virtual devices + like tunnels, use a normal eql device. + 2. This device puts no limitations on physical slave characteristics + f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-) + Certainly, large difference in link speeds will make the resulting + eqalized link unusable, because of huge packet reordering. + I estimate an upper useful difference as ~10 times. + 3. If the slave requires address resolution, only protocols using + neighbour cache (IPv4/IPv6) will work over the equalized link. + Other protocols are still allowed to use the slave device directly, + which will not break load balancing, though native slave + traffic will have the highest priority. */ + +struct teql_master +{ + struct Qdisc_ops qops; + struct net_device *dev; + struct Qdisc *slaves; + struct list_head master_list; + struct net_device_stats stats; +}; + +struct teql_sched_data +{ + struct Qdisc *next; + struct teql_master *m; + struct neighbour *ncache; + struct sk_buff_head q; +}; + +#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) + +#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) + +/* "teql*" qdisc routines */ + +static int +teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct net_device *dev = sch->dev; + struct teql_sched_data *q = qdisc_priv(sch); + + __skb_queue_tail(&q->q, skb); + if (q->q.qlen <= dev->tx_queue_len) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + + __skb_unlink(skb, &q->q); + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; +} + +static int +teql_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct teql_sched_data *q = qdisc_priv(sch); + + __skb_queue_head(&q->q, skb); + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff * +teql_dequeue(struct Qdisc* sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + struct sk_buff *skb; + + skb = __skb_dequeue(&dat->q); + if (skb == NULL) { + struct net_device *m = dat->m->dev->qdisc->dev; + if (m) { + dat->m->slaves = sch; + netif_wake_queue(m); + } + } + sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen; + return skb; +} + +static __inline__ void +teql_neigh_release(struct neighbour *n) +{ + if (n) + neigh_release(n); +} + +static void +teql_reset(struct Qdisc* sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + + skb_queue_purge(&dat->q); + sch->q.qlen = 0; + teql_neigh_release(xchg(&dat->ncache, NULL)); +} + +static void +teql_destroy(struct Qdisc* sch) +{ + struct Qdisc *q, *prev; + struct teql_sched_data *dat = qdisc_priv(sch); + struct teql_master *master = dat->m; + + if ((prev = master->slaves) != NULL) { + do { + q = NEXT_SLAVE(prev); + if (q == sch) { + NEXT_SLAVE(prev) = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NULL; + spin_lock_bh(&master->dev->queue_lock); + qdisc_reset(master->dev->qdisc); + spin_unlock_bh(&master->dev->queue_lock); + } + } + skb_queue_purge(&dat->q); + teql_neigh_release(xchg(&dat->ncache, NULL)); + break; + } + + } while ((prev = q) != master->slaves); + } +} + +static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct net_device *dev = sch->dev; + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = qdisc_priv(sch); + + if (dev->hard_header_len > m->dev->hard_header_len) + return -EINVAL; + + if (m->dev == dev) + return -ELOOP; + + q->m = m; + + skb_queue_head_init(&q->q); + + if (m->slaves) { + if (m->dev->flags & IFF_UP) { + if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) + || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) + || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) + || dev->mtu < m->dev->mtu) + return -EINVAL; + } else { + if (!(dev->flags&IFF_POINTOPOINT)) + m->dev->flags &= ~IFF_POINTOPOINT; + if (!(dev->flags&IFF_BROADCAST)) + m->dev->flags &= ~IFF_BROADCAST; + if (!(dev->flags&IFF_MULTICAST)) + m->dev->flags &= ~IFF_MULTICAST; + if (dev->mtu < m->dev->mtu) + m->dev->mtu = dev->mtu; + } + q->next = NEXT_SLAVE(m->slaves); + NEXT_SLAVE(m->slaves) = sch; + } else { + q->next = sch; + m->slaves = sch; + m->dev->mtu = dev->mtu; + m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK); + } + return 0; +} + +/* "teql*" netdevice routines */ + +static int +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +{ + struct teql_sched_data *q = qdisc_priv(dev->qdisc); + struct neighbour *mn = skb->dst->neighbour; + struct neighbour *n = q->ncache; + + if (mn->tbl == NULL) + return -EINVAL; + if (n && n->tbl == mn->tbl && + memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { + atomic_inc(&n->refcnt); + } else { + n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + } + if (neigh_event_send(n, skb_res) == 0) { + int err; + read_lock(&n->lock); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len); + read_unlock(&n->lock); + if (err < 0) { + neigh_release(n); + return -EINVAL; + } + teql_neigh_release(xchg(&q->ncache, n)); + return 0; + } + neigh_release(n); + return (skb_res == NULL) ? -EAGAIN : 1; +} + +static __inline__ int +teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +{ + if (dev->hard_header == NULL || + skb->dst == NULL || + skb->dst->neighbour == NULL) + return 0; + return __teql_resolve(skb, skb_res, dev); +} + +static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct teql_master *master = (void*)dev->priv; + struct Qdisc *start, *q; + int busy; + int nores; + int len = skb->len; + struct sk_buff *skb_res = NULL; + + start = master->slaves; + +restart: + nores = 0; + busy = 0; + + if ((q = start) == NULL) + goto drop; + + do { + struct net_device *slave = q->dev; + + if (slave->qdisc_sleeping != q) + continue; + if (netif_queue_stopped(slave) || ! netif_running(slave)) { + busy = 1; + continue; + } + + switch (teql_resolve(skb, skb_res, slave)) { + case 0: + if (spin_trylock(&slave->xmit_lock)) { + slave->xmit_lock_owner = smp_processor_id(); + if (!netif_queue_stopped(slave) && + slave->hard_start_xmit(skb, slave) == 0) { + slave->xmit_lock_owner = -1; + spin_unlock(&slave->xmit_lock); + master->slaves = NEXT_SLAVE(q); + netif_wake_queue(dev); + master->stats.tx_packets++; + master->stats.tx_bytes += len; + return 0; + } + slave->xmit_lock_owner = -1; + spin_unlock(&slave->xmit_lock); + } + if (netif_queue_stopped(dev)) + busy = 1; + break; + case 1: + master->slaves = NEXT_SLAVE(q); + return 0; + default: + nores = 1; + break; + } + __skb_pull(skb, skb->nh.raw - skb->data); + } while ((q = NEXT_SLAVE(q)) != start); + + if (nores && skb_res == NULL) { + skb_res = skb; + goto restart; + } + + if (busy) { + netif_stop_queue(dev); + return 1; + } + master->stats.tx_errors++; + +drop: + master->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; +} + +static int teql_master_open(struct net_device *dev) +{ + struct Qdisc * q; + struct teql_master *m = (void*)dev->priv; + int mtu = 0xFFFE; + unsigned flags = IFF_NOARP|IFF_MULTICAST; + + if (m->slaves == NULL) + return -EUNATCH; + + flags = FMASK; + + q = m->slaves; + do { + struct net_device *slave = q->dev; + + if (slave == NULL) + return -EUNATCH; + + if (slave->mtu < mtu) + mtu = slave->mtu; + if (slave->hard_header_len > LL_MAX_HEADER) + return -EINVAL; + + /* If all the slaves are BROADCAST, master is BROADCAST + If all the slaves are PtP, master is PtP + Otherwise, master is NBMA. + */ + if (!(slave->flags&IFF_POINTOPOINT)) + flags &= ~IFF_POINTOPOINT; + if (!(slave->flags&IFF_BROADCAST)) + flags &= ~IFF_BROADCAST; + if (!(slave->flags&IFF_MULTICAST)) + flags &= ~IFF_MULTICAST; + } while ((q = NEXT_SLAVE(q)) != m->slaves); + + m->dev->mtu = mtu; + m->dev->flags = (m->dev->flags&~FMASK) | flags; + netif_start_queue(m->dev); + return 0; +} + +static int teql_master_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static struct net_device_stats *teql_master_stats(struct net_device *dev) +{ + struct teql_master *m = (void*)dev->priv; + return &m->stats; +} + +static int teql_master_mtu(struct net_device *dev, int new_mtu) +{ + struct teql_master *m = (void*)dev->priv; + struct Qdisc *q; + + if (new_mtu < 68) + return -EINVAL; + + q = m->slaves; + if (q) { + do { + if (new_mtu > q->dev->mtu) + return -EINVAL; + } while ((q=NEXT_SLAVE(q)) != m->slaves); + } + + dev->mtu = new_mtu; + return 0; +} + +static __init void teql_master_setup(struct net_device *dev) +{ + struct teql_master *master = dev->priv; + struct Qdisc_ops *ops = &master->qops; + + master->dev = dev; + ops->priv_size = sizeof(struct teql_sched_data); + + ops->enqueue = teql_enqueue; + ops->dequeue = teql_dequeue; + ops->requeue = teql_requeue; + ops->init = teql_qdisc_init; + ops->reset = teql_reset; + ops->destroy = teql_destroy; + ops->owner = THIS_MODULE; + + dev->open = teql_master_open; + dev->hard_start_xmit = teql_master_xmit; + dev->stop = teql_master_close; + dev->get_stats = teql_master_stats; + dev->change_mtu = teql_master_mtu; + dev->type = ARPHRD_VOID; + dev->mtu = 1500; + dev->tx_queue_len = 100; + dev->flags = IFF_NOARP; + dev->hard_header_len = LL_MAX_HEADER; + SET_MODULE_OWNER(dev); +} + +static LIST_HEAD(master_dev_list); +static int max_equalizers = 1; +module_param(max_equalizers, int, 0); +MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers"); + +static int __init teql_init(void) +{ + int i; + int err = -ENODEV; + + for (i = 0; i < max_equalizers; i++) { + struct net_device *dev; + struct teql_master *master; + + dev = alloc_netdev(sizeof(struct teql_master), + "teql%d", teql_master_setup); + if (!dev) { + err = -ENOMEM; + break; + } + + if ((err = register_netdev(dev))) { + free_netdev(dev); + break; + } + + master = dev->priv; + + strlcpy(master->qops.id, dev->name, IFNAMSIZ); + err = register_qdisc(&master->qops); + + if (err) { + unregister_netdev(dev); + free_netdev(dev); + break; + } + + list_add_tail(&master->master_list, &master_dev_list); + } + return i ? 0 : err; +} + +static void __exit teql_exit(void) +{ + struct teql_master *master, *nxt; + + list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) { + + list_del(&master->master_list); + + unregister_qdisc(&master->qops); + unregister_netdev(master->dev); + free_netdev(master->dev); + } +} + +module_init(teql_init); +module_exit(teql_exit); + +MODULE_LICENSE("GPL"); |