From 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 27 Oct 2008 12:28:25 -0700 Subject: net: implement emergency route cache rebulds when gc_elasticity is exceeded This is a patch to provide on demand route cache rebuilding. Currently, our route cache is rebulid periodically regardless of need. This introduced unneeded periodic latency. This patch offers a better approach. Using code provided by Eric Dumazet, we compute the standard deviation of the average hash bucket chain length while running rt_check_expire. Should any given chain length grow to larger that average plus 4 standard deviations, we trigger an emergency hash table rebuild for that net namespace. This allows for the common case in which chains are well behaved and do not grow unevenly to not incur any latency at all, while those systems (which may be being maliciously attacked), only rebuild when the attack is detected. This patch take 2 other factors into account: 1) chains with multiple entries that differ by attributes that do not affect the hash value are only counted once, so as not to unduly bias system to rebuilding if features like QOS are heavily used 2) if rebuilding crosses a certain threshold (which is adjustable via the added sysctl in this patch), route caching is disabled entirely for that net namespace, since constant rebuilding is less efficient that no caching at all Tested successfully by me. Signed-off-by: Neil Horman Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation/networking') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d84932650fd..c7712787933 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -27,6 +27,12 @@ min_adv_mss - INTEGER The advertised MSS depends on the first hop route MTU, but will never be lower than this setting. +rt_cache_rebuild_count - INTEGER + The per net-namespace route cache emergency rebuild threshold. + Any net-namespace having its route cache rebuilt due to + a hash bucket chain being too long more than this many times + will have its route caching disabled + IP Fragmentation: ipfrag_high_thresh - INTEGER -- cgit v1.2.3-70-g09d2 From 5c7f9b7363bfd10e40cf1a28dfc9048417df7028 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 14 Oct 2008 10:38:03 -0600 Subject: ipw2x00: change default policy for auto-associate Do not attempt association until directed to do so by a user space application. In particular, this avoids race conditions with NetworkManager association state. Signed-off-by: Tim Gardner Acked-by: Dan Williams Signed-off-by: John W. Linville --- Documentation/networking/README.ipw2200 | 2 +- drivers/net/wireless/ipw2100.c | 4 ++-- drivers/net/wireless/ipw2200.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/README.ipw2200 b/Documentation/networking/README.ipw2200 index 4f2a40f1dbc..80c728522c4 100644 --- a/Documentation/networking/README.ipw2200 +++ b/Documentation/networking/README.ipw2200 @@ -147,7 +147,7 @@ Where the supported parameter are: driver. If disabled, the driver will not attempt to scan for and associate to a network until it has been configured with one or more properties for the target network, for example configuring - the network SSID. Default is 1 (auto-associate) + the network SSID. Default is 0 (do not auto-associate) Example: % modprobe ipw2200 associate=0 diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c index 6e988d2486a..079dbd07e2f 100644 --- a/drivers/net/wireless/ipw2100.c +++ b/drivers/net/wireless/ipw2100.c @@ -185,7 +185,7 @@ MODULE_LICENSE("GPL"); static int debug = 0; static int mode = 0; static int channel = 0; -static int associate = 1; +static int associate = 0; static int disable = 0; #ifdef CONFIG_PM static struct ipw2100_fw ipw2100_firmware; @@ -201,7 +201,7 @@ module_param(disable, int, 0444); MODULE_PARM_DESC(debug, "debug level"); MODULE_PARM_DESC(mode, "network mode (0=BSS,1=IBSS,2=Monitor)"); MODULE_PARM_DESC(channel, "channel"); -MODULE_PARM_DESC(associate, "auto associate when scanning (default on)"); +MODULE_PARM_DESC(associate, "auto associate when scanning (default off)"); MODULE_PARM_DESC(disable, "manually disable the radio (default 0 [radio on])"); static u32 ipw2100_debug_level = IPW_DL_NONE; diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c index 13633d8274a..6ec6de2960e 100644 --- a/drivers/net/wireless/ipw2200.c +++ b/drivers/net/wireless/ipw2200.c @@ -87,7 +87,7 @@ static int channel = 0; static int mode = 0; static u32 ipw_debug_level; -static int associate = 1; +static int associate; static int auto_create = 1; static int led = 0; static int disable = 0; @@ -11913,7 +11913,7 @@ module_param(disable, int, 0444); MODULE_PARM_DESC(disable, "manually disable the radio (default 0 [radio on])"); module_param(associate, int, 0444); -MODULE_PARM_DESC(associate, "auto associate when scanning (default on)"); +MODULE_PARM_DESC(associate, "auto associate when scanning (default off)"); module_param(auto_create, int, 0444); MODULE_PARM_DESC(auto_create, "auto create adhoc network (default on)"); -- cgit v1.2.3-70-g09d2 From d2372b315289aec9f565a855023c40654a5bff68 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Oct 2008 20:32:20 +0200 Subject: wireless: make regdom passing semantics simpler The regdom struct is given to the core, so it might as well free it in error conditions. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- Documentation/networking/regulatory.txt | 13 +++---------- include/net/wireless.h | 3 +-- net/wireless/nl80211.c | 5 +---- net/wireless/reg.c | 9 +++++---- 4 files changed, 10 insertions(+), 20 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index a96989a8ff3..357d4ba4f13 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt @@ -167,7 +167,6 @@ struct ieee80211_regdomain mydriver_jp_regdom = { Then in some part of your code after your wiphy has been registered: - int r; struct ieee80211_regdomain *rd; int size_of_regd; int num_rules = mydriver_jp_regdom.n_reg_rules; @@ -178,17 +177,11 @@ Then in some part of your code after your wiphy has been registered: rd = kzalloc(size_of_regd, GFP_KERNEL); if (!rd) - return -ENOMEM; + return -ENOMEM; memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain)); - for (i=0; i < num_rules; i++) { + for (i=0; i < num_rules; i++) memcpy(&rd->reg_rules[i], &mydriver_jp_regdom.reg_rules[i], sizeof(struct ieee80211_reg_rule)); - } - r = regulatory_hint(hw->wiphy, NULL, rd); - if (r) { - kfree(rd); - return r; - } - + return regulatory_hint(hw->wiphy, NULL, rd); diff --git a/include/net/wireless.h b/include/net/wireless.h index 061fe5017e5..6e3ea015904 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -358,8 +358,7 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq) * for a regulatory domain structure for the respective country. If * a regulatory domain is build and passed you should set the alpha2 * if possible, otherwise set it to the special value of "99" which tells - * the wireless core it is unknown. If you pass a built regulatory domain - * and we return non zero you are in charge of kfree()'ing the structure. + * the wireless core it is unknown. * * Returns -EALREADY if *a regulatory domain* has already been set. Note that * this could be by another driver. It is safe for drivers to continue if diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9a16e9e6c5c..f82cc9aa690 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1935,12 +1935,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) mutex_lock(&cfg80211_drv_mutex); r = set_regdom(rd); mutex_unlock(&cfg80211_drv_mutex); - if (r) - goto bad_reg; - return r; -bad_reg: + bad_reg: kfree(rd); return -EINVAL; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 00c326b66c0..038f8f133c5 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -605,7 +605,6 @@ int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, return r; } -/* If rd is not NULL and if this call fails the caller must free it */ int regulatory_hint(struct wiphy *wiphy, const char *alpha2, struct ieee80211_regdomain *rd) { @@ -690,6 +689,7 @@ void print_regdomain_info(const struct ieee80211_regdomain *rd) print_rd_rules(rd); } +/* Takes ownership of rd only if it doesn't fail */ static int __set_regdom(const struct ieee80211_regdomain *rd) { /* Some basic sanity checks first */ @@ -750,16 +750,17 @@ static int __set_regdom(const struct ieee80211_regdomain *rd) /* Use this call to set the current regulatory domain. Conflicts with * multiple drivers can be ironed out later. Caller must've already - * kmalloc'd the rd structure. If this calls fails you should kfree() - * the passed rd. Caller must hold cfg80211_drv_mutex */ + * kmalloc'd the rd structure. Caller must hold cfg80211_drv_mutex */ int set_regdom(const struct ieee80211_regdomain *rd) { int r; /* Note that this doesn't update the wiphys, this is done below */ r = __set_regdom(rd); - if (r) + if (r) { + kfree(rd); return r; + } /* This would make this whole thing pointless */ BUG_ON(rd != cfg80211_regdomain); -- cgit v1.2.3-70-g09d2 From be3d48106c1e5d075784e5e67928a6b5ffc0f3b6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Oct 2008 20:32:21 +0200 Subject: wireless: remove struct regdom hinting The code needs to be split out and cleaned up, so as a first step remove the capability, to add it back in a subsequent patch as a separate function. Also remove the publically facing return value of the function and the wiphy argument. A number of internal functions go from being generic helpers to just being used for alpha2 setting. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- Documentation/networking/regulatory.txt | 11 +++--- drivers/net/wireless/zd1211rw/zd_mac.c | 2 +- include/net/wireless.h | 23 +++--------- net/wireless/nl80211.c | 2 +- net/wireless/reg.c | 63 +++++++++------------------------ net/wireless/reg.h | 23 ++++-------- 6 files changed, 36 insertions(+), 88 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index 357d4ba4f13..dcf31648414 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt @@ -131,11 +131,13 @@ are expected to do this during initialization. r = zd_reg2alpha2(mac->regdomain, alpha2); if (!r) - regulatory_hint(hw->wiphy, alpha2, NULL); + regulatory_hint(hw->wiphy, alpha2); Example code - drivers providing a built in regulatory domain: -------------------------------------------------------------- +[NOTE: This API is not currently available, it can be added when required] + If you have regulatory information you can obtain from your driver and you *need* to use this we let you build a regulatory domain structure and pass it to the wireless core. To do this you should @@ -182,6 +184,7 @@ Then in some part of your code after your wiphy has been registered: memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain)); for (i=0; i < num_rules; i++) - memcpy(&rd->reg_rules[i], &mydriver_jp_regdom.reg_rules[i], - sizeof(struct ieee80211_reg_rule)); - return regulatory_hint(hw->wiphy, NULL, rd); + memcpy(&rd->reg_rules[i], + &mydriver_jp_regdom.reg_rules[i], + sizeof(struct ieee80211_reg_rule)); + regulatory_struct_hint(rd); diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c index 2f0802b29c4..07513e48b8f 100644 --- a/drivers/net/wireless/zd1211rw/zd_mac.c +++ b/drivers/net/wireless/zd1211rw/zd_mac.c @@ -171,7 +171,7 @@ int zd_mac_init_hw(struct ieee80211_hw *hw) r = zd_reg2alpha2(mac->regdomain, alpha2); if (!r) - regulatory_hint(hw->wiphy, alpha2, NULL); + regulatory_hint(hw->wiphy, alpha2); r = 0; disable_int: diff --git a/include/net/wireless.h b/include/net/wireless.h index 6e3ea015904..41294c5f6f8 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -342,34 +342,19 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq) /** * regulatory_hint - driver hint to the wireless core a regulatory domain - * @wiphy: the driver's very own &struct wiphy + * @wiphy: the wireless device giving the hint (used only for reporting + * conflicts) * @alpha2: the ISO/IEC 3166 alpha2 the driver claims its regulatory domain * should be in. If @rd is set this should be NULL. Note that if you * set this to NULL you should still set rd->alpha2 to some accepted * alpha2. - * @rd: a complete regulatory domain provided by the driver. If passed - * the driver does not need to worry about freeing it. * * Wireless drivers can use this function to hint to the wireless core * what it believes should be the current regulatory domain by * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory * domain should be in or by providing a completely build regulatory domain. * If the driver provides an ISO/IEC 3166 alpha2 userspace will be queried - * for a regulatory domain structure for the respective country. If - * a regulatory domain is build and passed you should set the alpha2 - * if possible, otherwise set it to the special value of "99" which tells - * the wireless core it is unknown. - * - * Returns -EALREADY if *a regulatory domain* has already been set. Note that - * this could be by another driver. It is safe for drivers to continue if - * -EALREADY is returned, if drivers are not capable of world roaming they - * should not register more channels than they support. Right now we only - * support listening to the first driver hint. If the driver is capable - * of world roaming but wants to respect its own EEPROM mappings for - * specific regulatory domains it should register the @reg_notifier callback - * on the &struct wiphy. Returns 0 if the hint went through fine or through an - * intersection operation. Otherwise a standard error code is returned. + * for a regulatory domain structure for the respective country. */ -extern int regulatory_hint(struct wiphy *wiphy, - const char *alpha2, struct ieee80211_regdomain *rd); +extern void regulatory_hint(struct wiphy *wiphy, const char *alpha2); #endif /* __NET_WIRELESS_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f82cc9aa690..5e1d658a8b5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1695,7 +1695,7 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) return -EINVAL; #endif mutex_lock(&cfg80211_drv_mutex); - r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, NULL); + r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data); mutex_unlock(&cfg80211_drv_mutex); return r; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 038f8f133c5..dc10071deaa 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -42,7 +42,10 @@ #include "core.h" #include "reg.h" -/* wiphy is set if this request's initiator is REGDOM_SET_BY_DRIVER */ +/* + * wiphy is set if this request's initiator is + * REGDOM_SET_BY_COUNTRY_IE or _DRIVER + */ struct regulatory_request { struct wiphy *wiphy; enum reg_set_by initiator; @@ -298,7 +301,7 @@ static int call_crda(const char *alpha2) /* This has the logic which determines when a new request * should be ignored. */ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, - char *alpha2, struct ieee80211_regdomain *rd) + const char *alpha2) { /* All initial requests are respected */ if (!last_request) @@ -343,22 +346,8 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, return 1; case REGDOM_SET_BY_DRIVER: BUG_ON(!wiphy); - if (last_request->initiator == REGDOM_SET_BY_DRIVER) { - /* Two separate drivers hinting different things, - * this is possible if you have two devices present - * on a system with different EEPROM regulatory - * readings. XXX: Do intersection, we support only - * the first regulatory hint for now */ - if (last_request->wiphy != wiphy) - return -EALREADY; - if (rd) - return -EALREADY; - /* Driver should not be trying to hint different - * regulatory domains! */ - BUG_ON(!alpha2_equal(alpha2, - cfg80211_regdomain->alpha2)); + if (last_request->initiator == REGDOM_SET_BY_DRIVER) return -EALREADY; - } if (last_request->initiator == REGDOM_SET_BY_CORE) return 0; /* XXX: Handle intersection, and add the @@ -557,40 +546,32 @@ void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby) /* Caller must hold &cfg80211_drv_mutex */ int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2, struct ieee80211_regdomain *rd) + const char *alpha2) { struct regulatory_request *request; - char *rd_alpha2; int r = 0; - r = ignore_request(wiphy, set_by, (char *) alpha2, rd); + r = ignore_request(wiphy, set_by, alpha2); if (r) return r; - if (rd) - rd_alpha2 = rd->alpha2; - else - rd_alpha2 = (char *) alpha2; - switch (set_by) { case REGDOM_SET_BY_CORE: case REGDOM_SET_BY_COUNTRY_IE: case REGDOM_SET_BY_DRIVER: case REGDOM_SET_BY_USER: request = kzalloc(sizeof(struct regulatory_request), - GFP_KERNEL); + GFP_KERNEL); if (!request) return -ENOMEM; - request->alpha2[0] = rd_alpha2[0]; - request->alpha2[1] = rd_alpha2[1]; + request->alpha2[0] = alpha2[0]; + request->alpha2[1] = alpha2[1]; request->initiator = set_by; request->wiphy = wiphy; kfree(last_request); last_request = request; - if (rd) - break; r = call_crda(alpha2); #ifndef CONFIG_WIRELESS_OLD_REGULATORY if (r) @@ -605,25 +586,13 @@ int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, return r; } -int regulatory_hint(struct wiphy *wiphy, const char *alpha2, - struct ieee80211_regdomain *rd) +void regulatory_hint(struct wiphy *wiphy, const char *alpha2) { - int r; - BUG_ON(!rd && !alpha2); + BUG_ON(!alpha2); mutex_lock(&cfg80211_drv_mutex); - - r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2, rd); - if (r || !rd) - goto unlock_and_exit; - - /* If the driver passed a regulatory domain we skipped asking - * userspace for one so we can now go ahead and set it */ - r = set_regdom(rd); - -unlock_and_exit: + __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2); mutex_unlock(&cfg80211_drv_mutex); - return r; } EXPORT_SYMBOL(regulatory_hint); @@ -792,11 +761,11 @@ int regulatory_init(void) * that is not a valid ISO / IEC 3166 alpha2 */ if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U') err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, - ieee80211_regdom, NULL); + ieee80211_regdom); #else cfg80211_regdomain = cfg80211_world_regdom; - err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", NULL); + err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00"); if (err) printk(KERN_ERR "cfg80211: calling CRDA failed - " "unable to update world regulatory domain, " diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 0c1572b92fe..c9b6b6358bb 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -11,30 +11,21 @@ int set_regdom(const struct ieee80211_regdomain *rd); /** * __regulatory_hint - hint to the wireless core a regulatory domain - * @wiphy: if a driver is providing the hint this is the driver's very - * own &struct wiphy + * @wiphy: if the hint comes from country information from an AP, this + * is required to be set to the wiphy that received the information * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain - * should be in. If @rd is set this should be NULL - * @rd: a complete regulatory domain, if passed the caller need not worry - * about freeing it + * should be in. * * The Wireless subsystem can use this function to hint to the wireless core * what it believes should be the current regulatory domain by * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory - * domain should be in or by providing a completely build regulatory domain. + * domain should be in. * - * Returns -EALREADY if *a regulatory domain* has already been set. Note that - * this could be by another driver. It is safe for drivers to continue if - * -EALREADY is returned, if drivers are not capable of world roaming they - * should not register more channels than they support. Right now we only - * support listening to the first driver hint. If the driver is capable - * of world roaming but wants to respect its own EEPROM mappings for - * specific regulatory domains it should register the @reg_notifier callback - * on the &struct wiphy. Returns 0 if the hint went through fine or through an - * intersection operation. Otherwise a standard error code is returned. + * Returns zero if all went fine, %-EALREADY if a regulatory domain had + * already been set or other standard error codes. * */ extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, - const char *alpha2, struct ieee80211_regdomain *rd); + const char *alpha2); #endif /* __NET_WIRELESS_REG_H */ -- cgit v1.2.3-70-g09d2 From 3f8b4b13785c2737413d3241c21c7c86a41535ef Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Wed, 22 Oct 2008 11:19:48 +0000 Subject: bonding: update docs to correctly reflect arp_ip_target behavior This documentation patch hopes to clarify that the '+' was only needed for Fedora 7 and Red Hat Enterprise Linux 5.0 and 5.1. After that the IP addreses could be added as a comma separated list just like the module option. Signed-off-by: Andy Gospodarek Signed-off-by: Jeff Garzik --- Documentation/networking/bonding.txt | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 688dfe1e6b7..d733a428eff 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -922,17 +922,19 @@ USERCTL=no NETMASK, NETWORK and BROADCAST) to match your network configuration. For later versions of initscripts, such as that found with Fedora -7 and Red Hat Enterprise Linux version 5 (or later), it is possible, and, -indeed, preferable, to specify the bonding options in the ifcfg-bond0 +7 (or later) and Red Hat Enterprise Linux version 5 (or later), it is possible, +and, indeed, preferable, to specify the bonding options in the ifcfg-bond0 file, e.g. a line of the format: -BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=+192.168.1.254" +BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=192.168.1.254" will configure the bond with the specified options. The options specified in BONDING_OPTS are identical to the bonding module parameters -except for the arp_ip_target field. Each target should be included as a -separate option and should be preceded by a '+' to indicate it should be -added to the list of queried targets, e.g., +except for the arp_ip_target field when using versions of initscripts older +than and 8.57 (Fedora 8) and 8.45.19 (Red Hat Enterprise Linux 5.2). When +using older versions each target should be included as a separate option and +should be preceded by a '+' to indicate it should be added to the list of +queried targets, e.g., arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2 @@ -940,7 +942,7 @@ added to the list of queried targets, e.g., options via BONDING_OPTS, it is not necessary to edit /etc/modules.conf or /etc/modprobe.conf. - For older versions of initscripts that do not support + For even older versions of initscripts that do not support BONDING_OPTS, it is necessary to edit /etc/modules.conf (or /etc/modprobe.conf, depending upon your distro) to load the bonding module with your desired options when the bond0 interface is brought up. The -- cgit v1.2.3-70-g09d2 From 305d552accae6afb859c493ebc7d98ca3371dae2 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Tue, 4 Nov 2008 17:51:14 -0800 Subject: bonding: send IPv6 neighbor advertisement on failover This patch adds better IPv6 failover support for bonding devices, especially when in active-backup mode and there are only IPv6 addresses configured, as reported by Alex Sidorenko. - Creates a new file, net/drivers/bonding/bond_ipv6.c, for the IPv6-specific routines. Both regular bonds and VLANs over bonds are supported. - Adds a new tunable, num_unsol_na, to limit the number of unsolicited IPv6 Neighbor Advertisements that are sent on a failover event. Default is 1. - Creates two new IPv6 neighbor discovery functions: ndisc_build_skb() ndisc_send_skb() These were required to support VLANs since we have to be able to add the VLAN id to the skb since ndisc_send_na() and friends shouldn't be asked to do this. These two routines are basically __ndisc_send() split into two pieces, in a slightly different order. - Updates Documentation/networking/bonding.txt and bumps the rev of bond support to 3.4.0. On failover, this new code will generate one packet: - An unsolicited IPv6 Neighbor Advertisement, which helps the switch learn that the address has moved to the new slave. Testing has shown that sending just the NA results in pretty good behavior when in active-back mode, I saw no lost ping packets for example. Signed-off-by: Brian Haley Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik --- Documentation/networking/bonding.txt | 10 ++ drivers/net/Kconfig | 1 + drivers/net/bonding/Makefile | 3 + drivers/net/bonding/bond_ipv6.c | 218 +++++++++++++++++++++++++++++++++++ drivers/net/bonding/bond_main.c | 33 +++++- drivers/net/bonding/bond_sysfs.c | 42 +++++++ drivers/net/bonding/bonding.h | 34 +++++- include/net/ndisc.h | 14 +++ net/ipv6/ndisc.c | 92 ++++++++++----- 9 files changed, 416 insertions(+), 31 deletions(-) create mode 100644 drivers/net/bonding/bond_ipv6.c (limited to 'Documentation/networking') diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index d733a428eff..3f4d0fae708 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -551,6 +551,16 @@ num_grat_arp affects only the active-backup mode. This option was added for bonding version 3.3.0. +num_unsol_na + + Specifies the number of unsolicited IPv6 Neighbor Advertisements + to be issued after a failover event. One unsolicited NA is issued + immediately after the failover. + + The valid range is 0 - 255; the default value is 1. This option + affects only the active-backup mode. This option was added for + bonding version 3.4.0. + primary A string (eth0, eth2, etc) specifying which slave is the diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 0f3e6b2d280..f1d0a137169 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -61,6 +61,7 @@ config DUMMY config BONDING tristate "Bonding driver support" depends on INET + depends on IPV6 || IPV6=n ---help--- Say 'Y' or 'M' if you wish to be able to 'bond' multiple Ethernet Channels together. This is called 'Etherchannel' by Cisco, diff --git a/drivers/net/bonding/Makefile b/drivers/net/bonding/Makefile index 5cdae2bc055..6f9c6faef24 100644 --- a/drivers/net/bonding/Makefile +++ b/drivers/net/bonding/Makefile @@ -6,3 +6,6 @@ obj-$(CONFIG_BONDING) += bonding.o bonding-objs := bond_main.o bond_3ad.o bond_alb.o bond_sysfs.o +ipv6-$(subst m,y,$(CONFIG_IPV6)) += bond_ipv6.o +bonding-objs += $(ipv6-y) + diff --git a/drivers/net/bonding/bond_ipv6.c b/drivers/net/bonding/bond_ipv6.c new file mode 100644 index 00000000000..7c78b7bf671 --- /dev/null +++ b/drivers/net/bonding/bond_ipv6.c @@ -0,0 +1,218 @@ +/* + * Copyright(c) 2008 Hewlett-Packard Development Company, L.P. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called LICENSE. + * + */ + +//#define BONDING_DEBUG 1 + +#include +#include +#include +#include +#include +#include "bonding.h" + +/* + * Assign bond->master_ipv6 to the next IPv6 address in the list, or + * zero it out if there are none. + */ +static void bond_glean_dev_ipv6(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + + if (!dev) + return; + + idev = in6_dev_get(dev); + if (!idev) + return; + + read_lock_bh(&idev->lock); + ifa = idev->addr_list; + if (ifa) + ipv6_addr_copy(addr, &ifa->addr); + else + ipv6_addr_set(addr, 0, 0, 0, 0); + + read_unlock_bh(&idev->lock); + + in6_dev_put(idev); +} + +static void bond_na_send(struct net_device *slave_dev, + struct in6_addr *daddr, + int router, + unsigned short vlan_id) +{ + struct in6_addr mcaddr; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + }; + struct sk_buff *skb; + + icmp6h.icmp6_router = router; + icmp6h.icmp6_solicited = 0; + icmp6h.icmp6_override = 1; + + addrconf_addr_solict_mult(daddr, &mcaddr); + + dprintk("ipv6 na on slave %s: dest %pI6, src %pI6\n", + slave->name, &mcaddr, daddr); + + skb = ndisc_build_skb(slave_dev, &mcaddr, daddr, &icmp6h, daddr, + ND_OPT_TARGET_LL_ADDR); + + if (!skb) { + printk(KERN_ERR DRV_NAME ": NA packet allocation failed\n"); + return; + } + + if (vlan_id) { + skb = vlan_put_tag(skb, vlan_id); + if (!skb) { + printk(KERN_ERR DRV_NAME ": failed to insert VLAN tag\n"); + return; + } + } + + ndisc_send_skb(skb, slave_dev, NULL, &mcaddr, daddr, &icmp6h); +} + +/* + * Kick out an unsolicited Neighbor Advertisement for an IPv6 address on + * the bonding master. This will help the switch learn our address + * if in active-backup mode. + * + * Caller must hold curr_slave_lock for read or better + */ +void bond_send_unsolicited_na(struct bonding *bond) +{ + struct slave *slave = bond->curr_active_slave; + struct vlan_entry *vlan; + struct inet6_dev *idev; + int is_router; + + dprintk("bond_send_unsol_na: bond %s slave %s\n", bond->dev->name, + slave ? slave->dev->name : "NULL"); + + if (!slave || !bond->send_unsol_na || + test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) + return; + + bond->send_unsol_na--; + + idev = in6_dev_get(bond->dev); + if (!idev) + return; + + is_router = !!idev->cnf.forwarding; + + in6_dev_put(idev); + + if (!ipv6_addr_any(&bond->master_ipv6)) + bond_na_send(slave->dev, &bond->master_ipv6, is_router, 0); + + list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { + if (!ipv6_addr_any(&vlan->vlan_ipv6)) { + bond_na_send(slave->dev, &vlan->vlan_ipv6, is_router, + vlan->vlan_id); + } + } +} + +/* + * bond_inet6addr_event: handle inet6addr notifier chain events. + * + * We keep track of device IPv6 addresses primarily to use as source + * addresses in NS probes. + * + * We track one IPv6 for the main device (if it has one). + */ +static int bond_inet6addr_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct net_device *vlan_dev, *event_dev = ifa->idev->dev; + struct bonding *bond; + struct vlan_entry *vlan; + + if (dev_net(event_dev) != &init_net) + return NOTIFY_DONE; + + list_for_each_entry(bond, &bond_dev_list, bond_list) { + if (bond->dev == event_dev) { + switch (event) { + case NETDEV_UP: + if (ipv6_addr_any(&bond->master_ipv6)) + ipv6_addr_copy(&bond->master_ipv6, + &ifa->addr); + return NOTIFY_OK; + case NETDEV_DOWN: + if (ipv6_addr_equal(&bond->master_ipv6, + &ifa->addr)) + bond_glean_dev_ipv6(bond->dev, + &bond->master_ipv6); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } + } + + list_for_each_entry(vlan, &bond->vlan_list, vlan_list) { + vlan_dev = vlan_group_get_device(bond->vlgrp, + vlan->vlan_id); + if (vlan_dev == event_dev) { + switch (event) { + case NETDEV_UP: + if (ipv6_addr_any(&vlan->vlan_ipv6)) + ipv6_addr_copy(&vlan->vlan_ipv6, + &ifa->addr); + return NOTIFY_OK; + case NETDEV_DOWN: + if (ipv6_addr_equal(&vlan->vlan_ipv6, + &ifa->addr)) + bond_glean_dev_ipv6(vlan_dev, + &vlan->vlan_ipv6); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } + } + } + } + return NOTIFY_DONE; +} + +static struct notifier_block bond_inet6addr_notifier = { + .notifier_call = bond_inet6addr_event, +}; + +void bond_register_ipv6_notifier(void) +{ + register_inet6addr_notifier(&bond_inet6addr_notifier); +} + +void bond_unregister_ipv6_notifier(void) +{ + unregister_inet6addr_notifier(&bond_inet6addr_notifier); +} + diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 39575d76497..798d98ce2d9 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -89,6 +89,7 @@ static int max_bonds = BOND_DEFAULT_MAX_BONDS; static int num_grat_arp = 1; +static int num_unsol_na = 1; static int miimon = BOND_LINK_MON_INTERV; static int updelay = 0; static int downdelay = 0; @@ -107,6 +108,8 @@ module_param(max_bonds, int, 0); MODULE_PARM_DESC(max_bonds, "Max number of bonded devices"); module_param(num_grat_arp, int, 0644); MODULE_PARM_DESC(num_grat_arp, "Number of gratuitous ARP packets to send on failover event"); +module_param(num_unsol_na, int, 0644); +MODULE_PARM_DESC(num_unsol_na, "Number of unsolicited IPv6 Neighbor Advertisements packets to send on failover event"); module_param(miimon, int, 0); MODULE_PARM_DESC(miimon, "Link check interval in milliseconds"); module_param(updelay, int, 0); @@ -242,14 +245,13 @@ static int bond_add_vlan(struct bonding *bond, unsigned short vlan_id) dprintk("bond: %s, vlan id %d\n", (bond ? bond->dev->name: "None"), vlan_id); - vlan = kmalloc(sizeof(struct vlan_entry), GFP_KERNEL); + vlan = kzalloc(sizeof(struct vlan_entry), GFP_KERNEL); if (!vlan) { return -ENOMEM; } INIT_LIST_HEAD(&vlan->vlan_list); vlan->vlan_id = vlan_id; - vlan->vlan_ip = 0; write_lock_bh(&bond->lock); @@ -1208,6 +1210,9 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) bond->send_grat_arp = bond->params.num_grat_arp; bond_send_gratuitous_arp(bond); + bond->send_unsol_na = bond->params.num_unsol_na; + bond_send_unsolicited_na(bond); + write_unlock_bh(&bond->curr_slave_lock); read_unlock(&bond->lock); @@ -2463,6 +2468,12 @@ void bond_mii_monitor(struct work_struct *work) read_unlock(&bond->curr_slave_lock); } + if (bond->send_unsol_na) { + read_lock(&bond->curr_slave_lock); + bond_send_unsolicited_na(bond); + read_unlock(&bond->curr_slave_lock); + } + if (bond_miimon_inspect(bond)) { read_unlock(&bond->lock); rtnl_lock(); @@ -3158,6 +3169,12 @@ void bond_activebackup_arp_mon(struct work_struct *work) read_unlock(&bond->curr_slave_lock); } + if (bond->send_unsol_na) { + read_lock(&bond->curr_slave_lock); + bond_send_unsolicited_na(bond); + read_unlock(&bond->curr_slave_lock); + } + if (bond_ab_arp_inspect(bond, delta_in_ticks)) { read_unlock(&bond->lock); rtnl_lock(); @@ -3827,6 +3844,7 @@ static int bond_close(struct net_device *bond_dev) write_lock_bh(&bond->lock); bond->send_grat_arp = 0; + bond->send_unsol_na = 0; /* signal timers not to re-arm */ bond->kill_timers = 1; @@ -4542,6 +4560,7 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params) bond->primary_slave = NULL; bond->dev = bond_dev; bond->send_grat_arp = 0; + bond->send_unsol_na = 0; bond->setup_by_slave = 0; INIT_LIST_HEAD(&bond->vlan_list); @@ -4791,6 +4810,13 @@ static int bond_check_params(struct bond_params *params) num_grat_arp = 1; } + if (num_unsol_na < 0 || num_unsol_na > 255) { + printk(KERN_WARNING DRV_NAME + ": Warning: num_unsol_na (%d) not in range 0-255 so it " + "was reset to 1 \n", num_unsol_na); + num_unsol_na = 1; + } + /* reset values for 802.3ad */ if (bond_mode == BOND_MODE_8023AD) { if (!miimon) { @@ -4992,6 +5018,7 @@ static int bond_check_params(struct bond_params *params) params->xmit_policy = xmit_hashtype; params->miimon = miimon; params->num_grat_arp = num_grat_arp; + params->num_unsol_na = num_unsol_na; params->arp_interval = arp_interval; params->arp_validate = arp_validate_value; params->updelay = updelay; @@ -5144,6 +5171,7 @@ static int __init bonding_init(void) register_netdevice_notifier(&bond_netdev_notifier); register_inetaddr_notifier(&bond_inetaddr_notifier); + bond_register_ipv6_notifier(); goto out; err: @@ -5166,6 +5194,7 @@ static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); unregister_inetaddr_notifier(&bond_inetaddr_notifier); + bond_unregister_ipv6_notifier(); bond_destroy_sysfs(); diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index e400d7dfdfc..8788e3e3385 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -983,6 +983,47 @@ out: return ret; } static DEVICE_ATTR(num_grat_arp, S_IRUGO | S_IWUSR, bonding_show_n_grat_arp, bonding_store_n_grat_arp); + +/* + * Show and set the number of unsolicted NA's to send after a failover event. + */ +static ssize_t bonding_show_n_unsol_na(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct bonding *bond = to_bond(d); + + return sprintf(buf, "%d\n", bond->params.num_unsol_na); +} + +static ssize_t bonding_store_n_unsol_na(struct device *d, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int new_value, ret = count; + struct bonding *bond = to_bond(d); + + if (sscanf(buf, "%d", &new_value) != 1) { + printk(KERN_ERR DRV_NAME + ": %s: no num_unsol_na value specified.\n", + bond->dev->name); + ret = -EINVAL; + goto out; + } + if (new_value < 0 || new_value > 255) { + printk(KERN_ERR DRV_NAME + ": %s: Invalid num_unsol_na value %d not in range 0-255; rejected.\n", + bond->dev->name, new_value); + ret = -EINVAL; + goto out; + } else { + bond->params.num_unsol_na = new_value; + } +out: + return ret; +} +static DEVICE_ATTR(num_unsol_na, S_IRUGO | S_IWUSR, bonding_show_n_unsol_na, bonding_store_n_unsol_na); + /* * Show and set the MII monitor interval. There are two tricky bits * here. First, if MII monitoring is activated, then we must disable @@ -1420,6 +1461,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_lacp_rate.attr, &dev_attr_xmit_hash_policy.attr, &dev_attr_num_grat_arp.attr, + &dev_attr_num_unsol_na.attr, &dev_attr_miimon.attr, &dev_attr_primary.attr, &dev_attr_use_carrier.attr, diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index ffb668dd6d3..0491c7c2645 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -19,16 +19,19 @@ #include #include #include +#include #include "bond_3ad.h" #include "bond_alb.h" -#define DRV_VERSION "3.3.0" -#define DRV_RELDATE "June 10, 2008" +#define DRV_VERSION "3.4.0" +#define DRV_RELDATE "October 7, 2008" #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" #define BOND_MAX_ARP_TARGETS 16 +extern struct list_head bond_dev_list; + #ifdef BONDING_DEBUG #define dprintk(fmt, args...) \ printk(KERN_DEBUG \ @@ -126,6 +129,7 @@ struct bond_params { int xmit_policy; int miimon; int num_grat_arp; + int num_unsol_na; int arp_interval; int arp_validate; int use_carrier; @@ -148,6 +152,9 @@ struct vlan_entry { struct list_head vlan_list; __be32 vlan_ip; unsigned short vlan_id; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct in6_addr vlan_ipv6; +#endif }; struct slave { @@ -195,6 +202,7 @@ struct bonding { rwlock_t curr_slave_lock; s8 kill_timers; s8 send_grat_arp; + s8 send_unsol_na; s8 setup_by_slave; struct net_device_stats stats; #ifdef CONFIG_PROC_FS @@ -218,6 +226,9 @@ struct bonding { struct delayed_work arp_work; struct delayed_work alb_work; struct delayed_work ad_work; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct in6_addr master_ipv6; +#endif }; /** @@ -341,5 +352,24 @@ extern struct bond_parm_tbl xmit_hashtype_tbl[]; extern struct bond_parm_tbl arp_validate_tbl[]; extern struct bond_parm_tbl fail_over_mac_tbl[]; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +void bond_send_unsolicited_na(struct bonding *bond); +void bond_register_ipv6_notifier(void); +void bond_unregister_ipv6_notifier(void); +#else +static inline void bond_send_unsolicited_na(struct bonding *bond) +{ + return; +} +static inline void bond_register_ipv6_notifier(void) +{ + return; +} +static inline void bond_unregister_ipv6_notifier(void) +{ + return; +} +#endif + #endif /* _LINUX_BONDING_H */ diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 11dd0137c6a..ce532f2222c 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -108,6 +108,20 @@ extern void ndisc_send_redirect(struct sk_buff *skb, extern int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir); +extern struct sk_buff *ndisc_build_skb(struct net_device *dev, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, + const struct in6_addr *target, + int llinfo); + +extern void ndisc_send_skb(struct sk_buff *skb, + struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h); + /* diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 2a6752dae09..fbf451c0d77 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -437,38 +437,20 @@ static void pndisc_destructor(struct pneigh_entry *n) ipv6_dev_mc_dec(dev, &maddr); } -/* - * Send a Neighbour Advertisement - */ -static void __ndisc_send(struct net_device *dev, - struct neighbour *neigh, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - struct icmp6hdr *icmp6h, const struct in6_addr *target, - int llinfo) +struct sk_buff *ndisc_build_skb(struct net_device *dev, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, + const struct in6_addr *target, + int llinfo) { - struct flowi fl; - struct dst_entry *dst; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; struct sk_buff *skb; struct icmp6hdr *hdr; - struct inet6_dev *idev; int len; int err; - u8 *opt, type; - - type = icmp6h->icmp6_type; - - icmpv6_flow_init(sk, &fl, type, saddr, daddr, dev->ifindex); - - dst = icmp6_dst_alloc(dev, neigh, daddr); - if (!dst) - return; - - err = xfrm_lookup(&dst, &fl, NULL, 0); - if (err < 0) - return; + u8 *opt; if (!dev->addr_len) llinfo = 0; @@ -485,8 +467,7 @@ static void __ndisc_send(struct net_device *dev, ND_PRINTK0(KERN_ERR "ICMPv6 ND: %s() failed to allocate an skb.\n", __func__); - dst_release(dst); - return; + return NULL; } skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -513,6 +494,42 @@ static void __ndisc_send(struct net_device *dev, csum_partial((__u8 *) hdr, len, 0)); + return skb; +} + +EXPORT_SYMBOL(ndisc_build_skb); + +void ndisc_send_skb(struct sk_buff *skb, + struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h) +{ + struct flowi fl; + struct dst_entry *dst; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + struct inet6_dev *idev; + int err; + u8 type; + + type = icmp6h->icmp6_type; + + icmpv6_flow_init(sk, &fl, type, saddr, daddr, dev->ifindex); + + dst = icmp6_dst_alloc(dev, neigh, daddr); + if (!dst) { + kfree_skb(skb); + return; + } + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + kfree_skb(skb); + return; + } + skb->dst = dst; idev = in6_dev_get(dst->dev); @@ -529,6 +546,27 @@ static void __ndisc_send(struct net_device *dev, in6_dev_put(idev); } +EXPORT_SYMBOL(ndisc_send_skb); + +/* + * Send a Neighbour Discover packet + */ +static void __ndisc_send(struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, const struct in6_addr *target, + int llinfo) +{ + struct sk_buff *skb; + + skb = ndisc_build_skb(dev, daddr, saddr, icmp6h, target, llinfo); + if (!skb) + return; + + ndisc_send_skb(skb, dev, neigh, daddr, saddr, icmp6h); +} + static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, -- cgit v1.2.3-70-g09d2 From fd989c83325cb34795bc4d4aa6b13c06f90eac99 Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Tue, 4 Nov 2008 17:51:16 -0800 Subject: bonding: alternate agg selection policies for 802.3ad This patch implements alternative aggregator selection policies for 802.3ad. The existing policy, now termed "stable," selects the active aggregator by greatest bandwidth, and only reselects a new aggregator if the active aggregator is entirely disabled (no more ports or all ports down). This patch adds two new policies: bandwidth and count, selecting the active aggregator by total bandwidth (like the stable policy) or by the number of ports in the aggregator, respectively. These two policies also differ from the stable policy in that they will reselect the active aggregator when availability-related changes occur in the bond (e.g., link state change). This permits "gang failover" within 802.3ad, allowing redundant aggregators along parallel paths to always maintain the "best" aggregator as the active aggregator (rather than having to wait for the active to entirely fail). This patch also updates the driver version to 3.5.0. Signed-off-by: Jay Vosburgh Signed-off-by: Jeff Garzik --- Documentation/networking/bonding.txt | 42 +++++ drivers/net/bonding/bond_3ad.c | 326 ++++++++++++++++++++++------------- drivers/net/bonding/bond_3ad.h | 10 +- drivers/net/bonding/bond_main.c | 30 ++++ drivers/net/bonding/bond_sysfs.c | 49 ++++++ drivers/net/bonding/bonding.h | 5 +- 6 files changed, 333 insertions(+), 129 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 3f4d0fae708..5ede7473b42 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -194,6 +194,48 @@ or, for backwards compatibility, the option value. E.g., The parameters are as follows: +ad_select + + Specifies the 802.3ad aggregation selection logic to use. The + possible values and their effects are: + + stable or 0 + + The active aggregator is chosen by largest aggregate + bandwidth. + + Reselection of the active aggregator occurs only when all + slaves of the active aggregator are down or the active + aggregator has no slaves. + + This is the default value. + + bandwidth or 1 + + The active aggregator is chosen by largest aggregate + bandwidth. Reselection occurs if: + + - A slave is added to or removed from the bond + + - Any slave's link state changes + + - Any slave's 802.3ad association state changes + + - The bond's adminstrative state changes to up + + count or 2 + + The active aggregator is chosen by the largest number of + ports (slaves). Reselection occurs as described under the + "bandwidth" setting, above. + + The bandwidth and count selection policies permit failover of + 802.3ad aggregations when partial failure of the active aggregator + occurs. This keeps the aggregator with the highest availability + (either in bandwidth or in number of ports) active at all times. + + This option was added in bonding version 3.4.0. + arp_interval Specifies the ARP link monitoring frequency in milliseconds. diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 6106660a4a4..ba1372f2f14 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,17 @@ static inline struct aggregator *__get_next_agg(struct aggregator *aggregator) return &(SLAVE_AD_INFO(slave->next).aggregator); } +/* + * __agg_has_partner + * + * Return nonzero if aggregator has a partner (denoted by a non-zero ether + * address for the partner). Return 0 if not. + */ +static inline int __agg_has_partner(struct aggregator *agg) +{ + return !is_zero_ether_addr(agg->partner_system.mac_addr_value); +} + /** * __disable_port - disable the port's slave * @port: the port we're looking at @@ -274,14 +286,14 @@ static inline int __port_is_enabled(struct port *port) * __get_agg_selection_mode - get the aggregator selection mode * @port: the port we're looking at * - * Get the aggregator selection mode. Can be %BANDWIDTH or %COUNT. + * Get the aggregator selection mode. Can be %STABLE, %BANDWIDTH or %COUNT. */ static inline u32 __get_agg_selection_mode(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) { - return AD_BANDWIDTH; + return BOND_AD_STABLE; } return BOND_AD_INFO(bond).agg_select_mode; @@ -1414,9 +1426,82 @@ static void ad_port_selection_logic(struct port *port) // else set ready=FALSE in all aggregator's ports __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); - if (!__check_agg_selection_timer(port) && (aggregator = __get_first_agg(port))) { - ad_agg_selection_logic(aggregator); + aggregator = __get_first_agg(port); + ad_agg_selection_logic(aggregator); +} + +/* + * Decide if "agg" is a better choice for the new active aggregator that + * the current best, according to the ad_select policy. + */ +static struct aggregator *ad_agg_selection_test(struct aggregator *best, + struct aggregator *curr) +{ + /* + * 0. If no best, select current. + * + * 1. If the current agg is not individual, and the best is + * individual, select current. + * + * 2. If current agg is individual and the best is not, keep best. + * + * 3. Therefore, current and best are both individual or both not + * individual, so: + * + * 3a. If current agg partner replied, and best agg partner did not, + * select current. + * + * 3b. If current agg partner did not reply and best agg partner + * did reply, keep best. + * + * 4. Therefore, current and best both have partner replies or + * both do not, so perform selection policy: + * + * BOND_AD_COUNT: Select by count of ports. If count is equal, + * select by bandwidth. + * + * BOND_AD_STABLE, BOND_AD_BANDWIDTH: Select by bandwidth. + */ + if (!best) + return curr; + + if (!curr->is_individual && best->is_individual) + return curr; + + if (curr->is_individual && !best->is_individual) + return best; + + if (__agg_has_partner(curr) && !__agg_has_partner(best)) + return curr; + + if (!__agg_has_partner(curr) && __agg_has_partner(best)) + return best; + + switch (__get_agg_selection_mode(curr->lag_ports)) { + case BOND_AD_COUNT: + if (curr->num_of_ports > best->num_of_ports) + return curr; + + if (curr->num_of_ports < best->num_of_ports) + return best; + + /*FALLTHROUGH*/ + case BOND_AD_STABLE: + case BOND_AD_BANDWIDTH: + if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best)) + return curr; + + break; + + default: + printk(KERN_WARNING DRV_NAME + ": %s: Impossible agg select mode %d\n", + curr->slave->dev->master->name, + __get_agg_selection_mode(curr->lag_ports)); + break; } + + return best; } /** @@ -1424,156 +1509,138 @@ static void ad_port_selection_logic(struct port *port) * @aggregator: the aggregator we're looking at * * It is assumed that only one aggregator may be selected for a team. - * The logic of this function is to select (at first time) the aggregator with - * the most ports attached to it, and to reselect the active aggregator only if - * the previous aggregator has no more ports related to it. + * + * The logic of this function is to select the aggregator according to + * the ad_select policy: + * + * BOND_AD_STABLE: select the aggregator with the most ports attached to + * it, and to reselect the active aggregator only if the previous + * aggregator has no more ports related to it. + * + * BOND_AD_BANDWIDTH: select the aggregator with the highest total + * bandwidth, and reselect whenever a link state change takes place or the + * set of slaves in the bond changes. + * + * BOND_AD_COUNT: select the aggregator with largest number of ports + * (slaves), and reselect whenever a link state change takes place or the + * set of slaves in the bond changes. * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. */ -static void ad_agg_selection_logic(struct aggregator *aggregator) +static void ad_agg_selection_logic(struct aggregator *agg) { - struct aggregator *best_aggregator = NULL, *active_aggregator = NULL; - struct aggregator *last_active_aggregator = NULL, *origin_aggregator; + struct aggregator *best, *active, *origin; struct port *port; - u16 num_of_aggs=0; - origin_aggregator = aggregator; + origin = agg; - //get current active aggregator - last_active_aggregator = __get_active_agg(aggregator); + active = __get_active_agg(agg); + best = active; - // search for the aggregator with the most ports attached to it. do { - // count how many candidate lag's we have - if (aggregator->lag_ports) { - num_of_aggs++; - } - if (aggregator->is_active && !aggregator->is_individual && // if current aggregator is the active aggregator - MAC_ADDRESS_COMPARE(&(aggregator->partner_system), &(null_mac_addr))) { // and partner answers to 802.3ad PDUs - if (aggregator->num_of_ports) { // if any ports attached to the current aggregator - best_aggregator=NULL; // disregard the best aggregator that was chosen by now - break; // stop the selection of other aggregator if there are any ports attached to this active aggregator - } else { // no ports attached to this active aggregator - aggregator->is_active = 0; // mark this aggregator as not active anymore + agg->is_active = 0; + + if (agg->num_of_ports) + best = ad_agg_selection_test(best, agg); + + } while ((agg = __get_next_agg(agg))); + + if (best && + __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) { + /* + * For the STABLE policy, don't replace the old active + * aggregator if it's still active (it has an answering + * partner) or if both the best and active don't have an + * answering partner. + */ + if (active && active->lag_ports && + active->lag_ports->is_enabled && + (__agg_has_partner(active) || + (!__agg_has_partner(active) && !__agg_has_partner(best)))) { + if (!(!active->actor_oper_aggregator_key && + best->actor_oper_aggregator_key)) { + best = NULL; + active->is_active = 1; } } - if (aggregator->num_of_ports) { // if any ports attached - if (best_aggregator) { // if there is a candidte aggregator - //The reasons for choosing new best aggregator: - // 1. if current agg is NOT individual and the best agg chosen so far is individual OR - // current and best aggs are both individual or both not individual, AND - // 2a. current agg partner reply but best agg partner do not reply OR - // 2b. current agg partner reply OR current agg partner do not reply AND best agg partner also do not reply AND - // current has more ports/bandwidth, or same amount of ports but current has faster ports, THEN - // current agg become best agg so far - - //if current agg is NOT individual and the best agg chosen so far is individual change best_aggregator - if (!aggregator->is_individual && best_aggregator->is_individual) { - best_aggregator=aggregator; - } - // current and best aggs are both individual or both not individual - else if ((aggregator->is_individual && best_aggregator->is_individual) || - (!aggregator->is_individual && !best_aggregator->is_individual)) { - // current and best aggs are both individual or both not individual AND - // current agg partner reply but best agg partner do not reply - if ((MAC_ADDRESS_COMPARE(&(aggregator->partner_system), &(null_mac_addr)) && - !MAC_ADDRESS_COMPARE(&(best_aggregator->partner_system), &(null_mac_addr)))) { - best_aggregator=aggregator; - } - // current agg partner reply OR current agg partner do not reply AND best agg partner also do not reply - else if (! (!MAC_ADDRESS_COMPARE(&(aggregator->partner_system), &(null_mac_addr)) && - MAC_ADDRESS_COMPARE(&(best_aggregator->partner_system), &(null_mac_addr)))) { - if ((__get_agg_selection_mode(aggregator->lag_ports) == AD_BANDWIDTH)&& - (__get_agg_bandwidth(aggregator) > __get_agg_bandwidth(best_aggregator))) { - best_aggregator=aggregator; - } else if (__get_agg_selection_mode(aggregator->lag_ports) == AD_COUNT) { - if (((aggregator->num_of_ports > best_aggregator->num_of_ports) && - (aggregator->actor_oper_aggregator_key & AD_SPEED_KEY_BITS))|| - ((aggregator->num_of_ports == best_aggregator->num_of_ports) && - ((u16)(aggregator->actor_oper_aggregator_key & AD_SPEED_KEY_BITS) > - (u16)(best_aggregator->actor_oper_aggregator_key & AD_SPEED_KEY_BITS)))) { - best_aggregator=aggregator; - } - } - } - } - } else { - best_aggregator=aggregator; - } - } - aggregator->is_active = 0; // mark all aggregators as not active anymore - } while ((aggregator = __get_next_agg(aggregator))); - - // if we have new aggregator selected, don't replace the old aggregator if it has an answering partner, - // or if both old aggregator and new aggregator don't have answering partner - if (best_aggregator) { - if (last_active_aggregator && last_active_aggregator->lag_ports && last_active_aggregator->lag_ports->is_enabled && - (MAC_ADDRESS_COMPARE(&(last_active_aggregator->partner_system), &(null_mac_addr)) || // partner answers OR - (!MAC_ADDRESS_COMPARE(&(last_active_aggregator->partner_system), &(null_mac_addr)) && // both old and new - !MAC_ADDRESS_COMPARE(&(best_aggregator->partner_system), &(null_mac_addr)))) // partner do not answer - ) { - // if new aggregator has link, and old aggregator does not, replace old aggregator.(do nothing) - // -> don't replace otherwise. - if (!(!last_active_aggregator->actor_oper_aggregator_key && best_aggregator->actor_oper_aggregator_key)) { - best_aggregator=NULL; - last_active_aggregator->is_active = 1; // don't replace good old aggregator + } - } - } + if (best && (best == active)) { + best = NULL; + active->is_active = 1; } // if there is new best aggregator, activate it - if (best_aggregator) { - for (aggregator = __get_first_agg(best_aggregator->lag_ports); - aggregator; - aggregator = __get_next_agg(aggregator)) { - - dprintk("Agg=%d; Ports=%d; a key=%d; p key=%d; Indiv=%d; Active=%d\n", - aggregator->aggregator_identifier, aggregator->num_of_ports, - aggregator->actor_oper_aggregator_key, aggregator->partner_oper_aggregator_key, - aggregator->is_individual, aggregator->is_active); + if (best) { + dprintk("best Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", + best->aggregator_identifier, best->num_of_ports, + best->actor_oper_aggregator_key, + best->partner_oper_aggregator_key, + best->is_individual, best->is_active); + dprintk("best ports %p slave %p %s\n", + best->lag_ports, best->slave, + best->slave ? best->slave->dev->name : "NULL"); + + for (agg = __get_first_agg(best->lag_ports); agg; + agg = __get_next_agg(agg)) { + + dprintk("Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", + agg->aggregator_identifier, agg->num_of_ports, + agg->actor_oper_aggregator_key, + agg->partner_oper_aggregator_key, + agg->is_individual, agg->is_active); } // check if any partner replys - if (best_aggregator->is_individual) { - printk(KERN_WARNING DRV_NAME ": %s: Warning: No 802.3ad response from " - "the link partner for any adapters in the bond\n", - best_aggregator->slave->dev->master->name); - } - - // check if there are more than one aggregator - if (num_of_aggs > 1) { - dprintk("Warning: More than one Link Aggregation Group was " - "found in the bond. Only one group will function in the bond\n"); + if (best->is_individual) { + printk(KERN_WARNING DRV_NAME ": %s: Warning: No 802.3ad" + " response from the link partner for any" + " adapters in the bond\n", + best->slave->dev->master->name); } - best_aggregator->is_active = 1; - dprintk("LAG %d choosed as the active LAG\n", best_aggregator->aggregator_identifier); - dprintk("Agg=%d; Ports=%d; a key=%d; p key=%d; Indiv=%d; Active=%d\n", - best_aggregator->aggregator_identifier, best_aggregator->num_of_ports, - best_aggregator->actor_oper_aggregator_key, best_aggregator->partner_oper_aggregator_key, - best_aggregator->is_individual, best_aggregator->is_active); + best->is_active = 1; + dprintk("LAG %d chosen as the active LAG\n", + best->aggregator_identifier); + dprintk("Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", + best->aggregator_identifier, best->num_of_ports, + best->actor_oper_aggregator_key, + best->partner_oper_aggregator_key, + best->is_individual, best->is_active); // disable the ports that were related to the former active_aggregator - if (last_active_aggregator) { - for (port=last_active_aggregator->lag_ports; port; port=port->next_port_in_aggregator) { + if (active) { + for (port = active->lag_ports; port; + port = port->next_port_in_aggregator) { __disable_port(port); } } } - // if the selected aggregator is of join individuals(partner_system is NULL), enable their ports - active_aggregator = __get_active_agg(origin_aggregator); + /* + * if the selected aggregator is of join individuals + * (partner_system is NULL), enable their ports + */ + active = __get_active_agg(origin); - if (active_aggregator) { - if (!MAC_ADDRESS_COMPARE(&(active_aggregator->partner_system), &(null_mac_addr))) { - for (port=active_aggregator->lag_ports; port; port=port->next_port_in_aggregator) { + if (active) { + if (!__agg_has_partner(active)) { + for (port = active->lag_ports; port; + port = port->next_port_in_aggregator) { __enable_port(port); } } } + + if (origin->slave) { + struct bonding *bond; + + bond = bond_get_bond_by_slave(origin->slave); + if (bond) + bond_3ad_set_carrier(bond); + } } /** @@ -1830,6 +1897,19 @@ static void ad_initialize_lacpdu(struct lacpdu *lacpdu) // Check aggregators status in team every T seconds #define AD_AGGREGATOR_SELECTION_TIMER 8 +/* + * bond_3ad_initiate_agg_selection(struct bonding *bond) + * + * Set the aggregation selection timer, to initiate an agg selection in + * the very near future. Called during first initialization, and during + * any down to up transitions of the bond. + */ +void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) +{ + BOND_AD_INFO(bond).agg_select_timer = timeout; + BOND_AD_INFO(bond).agg_select_mode = bond->params.ad_select; +} + static u16 aggregator_identifier; /** @@ -1854,9 +1934,9 @@ void bond_3ad_initialize(struct bonding *bond, u16 tick_resolution, int lacp_fas // initialize how many times this module is called in one second(should be about every 100ms) ad_ticks_per_sec = tick_resolution; - // initialize the aggregator selection timer(to activate an aggregation selection after initialize) - BOND_AD_INFO(bond).agg_select_timer = (AD_AGGREGATOR_SELECTION_TIMER * ad_ticks_per_sec); - BOND_AD_INFO(bond).agg_select_mode = AD_BANDWIDTH; + bond_3ad_initiate_agg_selection(bond, + AD_AGGREGATOR_SELECTION_TIMER * + ad_ticks_per_sec); } } diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h index b5ee45f6d55..a803fe05f63 100644 --- a/drivers/net/bonding/bond_3ad.h +++ b/drivers/net/bonding/bond_3ad.h @@ -42,10 +42,11 @@ typedef struct mac_addr { u8 mac_addr_value[ETH_ALEN]; } mac_addr_t; -typedef enum { - AD_BANDWIDTH = 0, - AD_COUNT -} agg_selection_t; +enum { + BOND_AD_STABLE = 0, + BOND_AD_BANDWIDTH = 1, + BOND_AD_COUNT = 2, +}; // rx machine states(43.4.11 in the 802.3ad standard) typedef enum { @@ -277,6 +278,7 @@ void bond_3ad_initialize(struct bonding *bond, u16 tick_resolution, int lacp_fas int bond_3ad_bind_slave(struct slave *slave); void bond_3ad_unbind_slave(struct slave *slave); void bond_3ad_state_machine_handler(struct work_struct *); +void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout); void bond_3ad_adapter_speed_changed(struct slave *slave); void bond_3ad_adapter_duplex_changed(struct slave *slave); void bond_3ad_handle_link_change(struct slave *slave, char link); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 798d98ce2d9..02de3e03123 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -97,6 +97,7 @@ static int use_carrier = 1; static char *mode = NULL; static char *primary = NULL; static char *lacp_rate = NULL; +static char *ad_select = NULL; static char *xmit_hash_policy = NULL; static int arp_interval = BOND_LINK_ARP_INTERV; static char *arp_ip_target[BOND_MAX_ARP_TARGETS] = { NULL, }; @@ -130,6 +131,8 @@ MODULE_PARM_DESC(primary, "Primary network device to use"); module_param(lacp_rate, charp, 0); MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner " "(slow/fast)"); +module_param(ad_select, charp, 0); +MODULE_PARM_DESC(ad_select, "803.ad aggregation selection logic: stable (0, default), bandwidth (1), count (2)"); module_param(xmit_hash_policy, charp, 0); MODULE_PARM_DESC(xmit_hash_policy, "XOR hashing method: 0 for layer 2 (default)" ", 1 for layer 3+4"); @@ -200,6 +203,13 @@ struct bond_parm_tbl fail_over_mac_tbl[] = { { NULL, -1}, }; +struct bond_parm_tbl ad_select_tbl[] = { +{ "stable", BOND_AD_STABLE}, +{ "bandwidth", BOND_AD_BANDWIDTH}, +{ "count", BOND_AD_COUNT}, +{ NULL, -1}, +}; + /*-------------------------- Forward declarations ---------------------------*/ static void bond_send_gratuitous_arp(struct bonding *bond); @@ -3318,6 +3328,8 @@ static void bond_info_show_master(struct seq_file *seq) seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); + seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", + ad_select_tbl[bond->params.ad_select].modename); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", @@ -3824,6 +3836,7 @@ static int bond_open(struct net_device *bond_dev) queue_delayed_work(bond->wq, &bond->ad_work, 0); /* register to receive LACPDUs */ bond_register_lacpdu(bond); + bond_3ad_initiate_agg_selection(bond, 1); } return 0; @@ -4763,6 +4776,23 @@ static int bond_check_params(struct bond_params *params) } } + if (ad_select) { + params->ad_select = bond_parse_parm(ad_select, ad_select_tbl); + if (params->ad_select == -1) { + printk(KERN_ERR DRV_NAME + ": Error: Invalid ad_select \"%s\"\n", + ad_select == NULL ? "NULL" : ad_select); + return -EINVAL; + } + + if (bond_mode != BOND_MODE_8023AD) { + printk(KERN_WARNING DRV_NAME + ": ad_select param only affects 802.3ad mode\n"); + } + } else { + params->ad_select = BOND_AD_STABLE; + } + if (max_bonds < 0 || max_bonds > INT_MAX) { printk(KERN_WARNING DRV_NAME ": Warning: max_bonds (%d) not in range %d-%d, so it " diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index 8788e3e3385..aaf2927b5c3 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -48,6 +48,7 @@ extern struct list_head bond_dev_list; extern struct bond_params bonding_defaults; extern struct bond_parm_tbl bond_mode_tbl[]; extern struct bond_parm_tbl bond_lacp_tbl[]; +extern struct bond_parm_tbl ad_select_tbl[]; extern struct bond_parm_tbl xmit_hashtype_tbl[]; extern struct bond_parm_tbl arp_validate_tbl[]; extern struct bond_parm_tbl fail_over_mac_tbl[]; @@ -944,6 +945,53 @@ out: } static DEVICE_ATTR(lacp_rate, S_IRUGO | S_IWUSR, bonding_show_lacp, bonding_store_lacp); +static ssize_t bonding_show_ad_select(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct bonding *bond = to_bond(d); + + return sprintf(buf, "%s %d\n", + ad_select_tbl[bond->params.ad_select].modename, + bond->params.ad_select); +} + + +static ssize_t bonding_store_ad_select(struct device *d, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int new_value, ret = count; + struct bonding *bond = to_bond(d); + + if (bond->dev->flags & IFF_UP) { + printk(KERN_ERR DRV_NAME + ": %s: Unable to update ad_select because interface " + "is up.\n", bond->dev->name); + ret = -EPERM; + goto out; + } + + new_value = bond_parse_parm(buf, ad_select_tbl); + + if (new_value != -1) { + bond->params.ad_select = new_value; + printk(KERN_INFO DRV_NAME + ": %s: Setting ad_select to %s (%d).\n", + bond->dev->name, ad_select_tbl[new_value].modename, + new_value); + } else { + printk(KERN_ERR DRV_NAME + ": %s: Ignoring invalid ad_select value %.*s.\n", + bond->dev->name, (int)strlen(buf) - 1, buf); + ret = -EINVAL; + } +out: + return ret; +} + +static DEVICE_ATTR(ad_select, S_IRUGO | S_IWUSR, bonding_show_ad_select, bonding_store_ad_select); + /* * Show and set the number of grat ARP to send after a failover event. */ @@ -1459,6 +1507,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_downdelay.attr, &dev_attr_updelay.attr, &dev_attr_lacp_rate.attr, + &dev_attr_ad_select.attr, &dev_attr_xmit_hash_policy.attr, &dev_attr_num_grat_arp.attr, &dev_attr_num_unsol_na.attr, diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 0491c7c2645..b5eb8e65b30 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -23,8 +23,8 @@ #include "bond_3ad.h" #include "bond_alb.h" -#define DRV_VERSION "3.4.0" -#define DRV_RELDATE "October 7, 2008" +#define DRV_VERSION "3.5.0" +#define DRV_RELDATE "November 4, 2008" #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" @@ -137,6 +137,7 @@ struct bond_params { int updelay; int downdelay; int lacp_fast; + int ad_select; char primary[IFNAMSIZ]; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; }; -- cgit v1.2.3-70-g09d2 From d90ebcbfa7f5a8b4e20518c9f94c5c4e4cd3c2e5 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Wed, 12 Nov 2008 00:47:26 -0800 Subject: dccp: Query supported CCIDs This provides a data structure to record which CCIDs are locally supported and three accessor functions: - a test function for internal use which is used to validate CCID requests made by the user; - a copy function so that the list can be used for feature-negotiation; - documented getsockopt() support so that the user can query capabilities. The data structure is a table which is filled in at compile-time with the list of available CCIDs (which in turn depends on the Kconfig choices). Using the copy function for cloning the list of supported CCIDs is useful for feature negotiation, since the negotiation is now with the full list of available CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation will not fail if the peer requests to use CCID3 instead of CCID2. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 4 ++++ include/linux/dccp.h | 1 + net/dccp/ccid.c | 48 +++++++++++++++++++++++++++++++++++++++ net/dccp/ccid.h | 5 ++++ net/dccp/feat.c | 4 ++++ net/dccp/proto.c | 2 ++ 6 files changed, 64 insertions(+) (limited to 'Documentation/networking') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f..f0aeb20fa63 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -57,6 +57,10 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint (see include/linux/dccp.h for symbolic constants). +The caller needs to provide a sufficiently large (> 2) array of type uint8_t. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 484b8a1fb02..d3ac1bde60b 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -209,6 +209,7 @@ struct dccp_so_feat { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 +#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 8fe931a3d7a..647cb0614f8 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -13,6 +13,13 @@ #include "ccid.h" +static u8 builtin_ccids[] = { + DCCPC_CCID2, /* CCID2 is supported by default */ +#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) + DCCPC_CCID3, +#endif +}; + static struct ccid_operations *ccids[CCID_MAX]; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t ccids_lockct = ATOMIC_INIT(0); @@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } +/* check that up to @array_len members in @ccid_array are supported */ +bool ccid_support_check(u8 const *ccid_array, u8 array_len) +{ + u8 i, j, found; + + for (i = 0, found = 0; i < array_len; i++, found = 0) { + for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) + found = (ccid_array[i] == builtin_ccids[j]); + if (!found) + return false; + } + return true; +} + +/** + * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array + * @ccid_array: pointer to copy into + * @array_len: value to return length into + * This function allocates memory - caller must see that it is freed after use. + */ +int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) +{ + *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); + if (*ccid_array == NULL) + return -ENOBUFS; + *array_len = ARRAY_SIZE(builtin_ccids); + return 0; +} + +int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + if (len < sizeof(builtin_ccids)) + return -EINVAL; + + if (put_user(sizeof(builtin_ccids), optlen) || + copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) + return -EFAULT; + return 0; +} + int ccid_register(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index fdeae7b5731..259f5469d7d 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -103,6 +103,11 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } +extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); +extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); +extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *, int __user *); + extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 192d494a381..f79fb5e33f5 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -342,6 +342,10 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) return -EINVAL; + /* Avoid negotiating alien CCIDs by only advertising supported ones */ + if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) + return -EOPNOTSUPP; + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) return -ENOMEM; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 01332fe7a99..b4b10cbd888 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -649,6 +649,8 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; + case DCCP_SOCKOPT_AVAILABLE_CCIDS: + return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; -- cgit v1.2.3-70-g09d2 From dd9c0e363cef32b7d6f23d4c87e8dfe4f91fd1c5 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 16 Nov 2008 22:55:08 -0800 Subject: dccp: Deprecate Ack Ratio sysctl This patch deprecates the Ack Ratio sysctl, since * Ack Ratio is entirely ignored by CCID-3 and CCID-4, * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1); * even if it would work in CCID-2, there is no point for a user to change it: - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2), - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts (since waiting for Acks which will never arrive in this window), - cwnd is not a user-configurable value. The only reasonable place for Ack Ratio is to print it for debugging. It is planned to do this later on, as part of e.g. dccp_probe. With this patch Ack Ratio is now under full control of feature negotiation: * Ack Ratio is resolved as a dependency of the selected CCID; * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to the default of 2, following RFC 4340, 11.3 - "New connections start with Ack Ratio 2 for both endpoints"; * what happens then is part of another patch set, since it concerns the dynamic update of Ack Ratio while the connection is in full flight. Thanks to Tomasz Grobelny for discussion leading up to this patch. Signed-off-by: Gerrit Renker Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 2 -- net/dccp/dccp.h | 1 - net/dccp/minisocks.c | 1 - net/dccp/options.c | 1 - net/dccp/sysctl.c | 7 ------- 6 files changed, 15 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index f0aeb20fa63..43df4487379 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -125,9 +125,6 @@ send_ndp = 1 send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). -ack_ratio = 2 - The default Ack Ratio (sec. 11.3) to use. - tx_ccid = 2 Default CCID for the sender-receiver half-connection. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 5a5a89935db..eda389ce04f 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -368,7 +368,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ @@ -378,7 +377,6 @@ struct dccp_minisock { __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; - __u8 dccpms_ack_ratio; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e656dafb5d9..031ce350d3c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_ack_ratio; extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index afdacbb94d7..ed61bc58e41 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -47,7 +47,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; - dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 67a171a1268..515ad45013a 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 21295993fdb..f6e54f433e2 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "ack_ratio", - .data = &sysctl_dccp_feat_ack_ratio, - .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "send_ackvec", .data = &sysctl_dccp_feat_send_ack_vector, -- cgit v1.2.3-70-g09d2 From f9f88fed3433139b58962011c81597b44fd48458 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sun, 9 Nov 2008 18:38:51 +0200 Subject: mac80211_hwsim: Update documentation (AP mode enabled) AP mode is now enabled in mac80211, so there is no need to point users to an additional patch to enable the mode. In addition, add a pointer to more hwsim test cases in hostap.git. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- Documentation/networking/mac80211_hwsim/README | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/mac80211_hwsim/README b/Documentation/networking/mac80211_hwsim/README index 2ff8ccb8dc3..24ac91d5669 100644 --- a/Documentation/networking/mac80211_hwsim/README +++ b/Documentation/networking/mac80211_hwsim/README @@ -50,10 +50,6 @@ associates with the AP. hostapd and wpa_supplicant are used to take care of WPA2-PSK authentication. In addition, hostapd is also processing access point side of association. -Please note that the current Linux kernel does not enable AP mode, so a -simple patch is needed to enable AP mode selection: -http://johannes.sipsolutions.net/patches/kernel/all/LATEST/006-allow-ap-vlan-modes.patch - # Build mac80211_hwsim as part of kernel configuration @@ -65,3 +61,8 @@ hostapd hostapd.conf # Run wpa_supplicant (station) for wlan1 wpa_supplicant -Dwext -iwlan1 -c wpa_supplicant.conf + + +More test cases are available in hostap.git: +git://w1.fi/srv/git/hostap.git and mac80211_hwsim/tests subdirectory +(http://w1.fi/gitweb/gitweb.cgi?p=hostap.git;a=tree;f=mac80211_hwsim/tests) -- cgit v1.2.3-70-g09d2 From e022c2f07ae52bfbd92faa273db0db2f34eb28e8 Mon Sep 17 00:00:00 2001 From: Krzysztof Hałasa Date: Thu, 14 Aug 2008 19:17:38 +0200 Subject: WAN: new synchronous PPP implementation for generic HDLC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krzysztof Hałasa --- Documentation/networking/generic-hdlc.txt | 8 +- drivers/net/wan/Makefile | 2 +- drivers/net/wan/hdlc_ppp.c | 648 +++++++++++++++++++++++++++--- 3 files changed, 606 insertions(+), 52 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/generic-hdlc.txt b/Documentation/networking/generic-hdlc.txt index 31bc8b759b7..4eb3cc40b70 100644 --- a/Documentation/networking/generic-hdlc.txt +++ b/Documentation/networking/generic-hdlc.txt @@ -3,15 +3,15 @@ Krzysztof Halasa Generic HDLC layer currently supports: -1. Frame Relay (ANSI, CCITT, Cisco and no LMI). +1. Frame Relay (ANSI, CCITT, Cisco and no LMI) - Normal (routed) and Ethernet-bridged (Ethernet device emulation) interfaces can share a single PVC. - ARP support (no InARP support in the kernel - there is an experimental InARP user-space daemon available on: http://www.kernel.org/pub/linux/utils/net/hdlc/). -2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation. -3. Cisco HDLC. -4. PPP (uses syncppp.c). +2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation +3. Cisco HDLC +4. PPP 5. X.25 (uses X.25 routines). Generic HDLC is a protocol driver only - it needs a low-level driver diff --git a/drivers/net/wan/Makefile b/drivers/net/wan/Makefile index 102549605d0..cec16818a13 100644 --- a/drivers/net/wan/Makefile +++ b/drivers/net/wan/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_HDLC_RAW) += hdlc_raw.o obj-$(CONFIG_HDLC_RAW_ETH) += hdlc_raw_eth.o obj-$(CONFIG_HDLC_CISCO) += hdlc_cisco.o obj-$(CONFIG_HDLC_FR) += hdlc_fr.o -obj-$(CONFIG_HDLC_PPP) += hdlc_ppp.o syncppp.o +obj-$(CONFIG_HDLC_PPP) += hdlc_ppp.o obj-$(CONFIG_HDLC_X25) += hdlc_x25.o pc300-y := pc300_drv.o diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c index 4efe9e6d32d..72fae217f1c 100644 --- a/drivers/net/wan/hdlc_ppp.c +++ b/drivers/net/wan/hdlc_ppp.c @@ -2,7 +2,7 @@ * Generic HDLC support routines for Linux * Point-to-point protocol support * - * Copyright (C) 1999 - 2006 Krzysztof Halasa + * Copyright (C) 1999 - 2008 Krzysztof Halasa * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License @@ -18,87 +18,632 @@ #include #include #include -#include #include #include -#include +#include + +#define DEBUG_CP 0 /* also bytes# to dump */ +#define DEBUG_STATE 0 +#define DEBUG_HARD_HEADER 0 + +#define HDLC_ADDR_ALLSTATIONS 0xFF +#define HDLC_CTRL_UI 0x03 + +#define PID_LCP 0xC021 +#define PID_IP 0x0021 +#define PID_IPCP 0x8021 +#define PID_IPV6 0x0057 +#define PID_IPV6CP 0x8057 + +enum {IDX_LCP = 0, IDX_IPCP, IDX_IPV6CP, IDX_COUNT}; +enum {CP_CONF_REQ = 1, CP_CONF_ACK, CP_CONF_NAK, CP_CONF_REJ, CP_TERM_REQ, + CP_TERM_ACK, CP_CODE_REJ, LCP_PROTO_REJ, LCP_ECHO_REQ, LCP_ECHO_REPLY, + LCP_DISC_REQ, CP_CODES}; +#if DEBUG_CP +static const char *const code_names[CP_CODES] = { + "0", "ConfReq", "ConfAck", "ConfNak", "ConfRej", "TermReq", + "TermAck", "CodeRej", "ProtoRej", "EchoReq", "EchoReply", "Discard" +}; +static char debug_buffer[64 + 3 * DEBUG_CP]; +#endif + +enum {LCP_OPTION_MRU = 1, LCP_OPTION_ACCM, LCP_OPTION_MAGIC = 5}; + +struct hdlc_header { + u8 address; + u8 control; + __be16 protocol; +}; + +struct cp_header { + u8 code; + u8 id; + __be16 len; +}; + -struct ppp_state { - struct ppp_device pppdev; - struct ppp_device *syncppp_ptr; - int (*old_change_mtu)(struct net_device *dev, int new_mtu); +struct proto { + struct net_device *dev; + struct timer_list timer; + unsigned long timeout; + u16 pid; /* protocol ID */ + u8 state; + u8 cr_id; /* ID of last Configuration-Request */ + u8 restart_counter; }; +struct ppp { + struct proto protos[IDX_COUNT]; + spinlock_t lock; + unsigned long last_pong; + unsigned int req_timeout, cr_retries, term_retries; + unsigned int keepalive_interval, keepalive_timeout; + u8 seq; /* local sequence number for requests */ + u8 echo_id; /* ID of last Echo-Request (LCP) */ +}; + +enum {CLOSED = 0, STOPPED, STOPPING, REQ_SENT, ACK_RECV, ACK_SENT, OPENED, + STATES, STATE_MASK = 0xF}; +enum {START = 0, STOP, TO_GOOD, TO_BAD, RCR_GOOD, RCR_BAD, RCA, RCN, RTR, RTA, + RUC, RXJ_GOOD, RXJ_BAD, EVENTS}; +enum {INV = 0x10, IRC = 0x20, ZRC = 0x40, SCR = 0x80, SCA = 0x100, + SCN = 0x200, STR = 0x400, STA = 0x800, SCJ = 0x1000}; + +#if DEBUG_STATE +static const char *const state_names[STATES] = { + "Closed", "Stopped", "Stopping", "ReqSent", "AckRecv", "AckSent", + "Opened" +}; +static const char *const event_names[EVENTS] = { + "Start", "Stop", "TO+", "TO-", "RCR+", "RCR-", "RCA", "RCN", + "RTR", "RTA", "RUC", "RXJ+", "RXJ-" +}; +#endif + +static struct sk_buff_head tx_queue; /* used when holding the spin lock */ + static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr); +static inline struct ppp* get_ppp(struct net_device *dev) +{ + return (struct ppp *)dev_to_hdlc(dev)->state; +} -static inline struct ppp_state* state(hdlc_device *hdlc) +static inline struct proto* get_proto(struct net_device *dev, u16 pid) { - return(struct ppp_state *)(hdlc->state); + struct ppp *ppp = get_ppp(dev); + + switch (pid) { + case PID_LCP: + return &ppp->protos[IDX_LCP]; + case PID_IPCP: + return &ppp->protos[IDX_IPCP]; + case PID_IPV6CP: + return &ppp->protos[IDX_IPV6CP]; + default: + return NULL; + } } +static inline const char* proto_name(u16 pid) +{ + switch (pid) { + case PID_LCP: + return "LCP"; + case PID_IPCP: + return "IPCP"; + case PID_IPV6CP: + return "IPV6CP"; + default: + return NULL; + } +} -static int ppp_open(struct net_device *dev) +static __be16 ppp_type_trans(struct sk_buff *skb, struct net_device *dev) { - hdlc_device *hdlc = dev_to_hdlc(dev); - int (*old_ioctl)(struct net_device *, struct ifreq *, int); - int result; + struct hdlc_header *data = (struct hdlc_header*)skb->data; + + if (skb->len < sizeof(struct hdlc_header)) + return htons(ETH_P_HDLC); + if (data->address != HDLC_ADDR_ALLSTATIONS || + data->control != HDLC_CTRL_UI) + return htons(ETH_P_HDLC); + + switch (data->protocol) { + case __constant_htons(PID_IP): + skb_pull(skb, sizeof(struct hdlc_header)); + return htons(ETH_P_IP); - dev->ml_priv = &state(hdlc)->syncppp_ptr; - state(hdlc)->syncppp_ptr = &state(hdlc)->pppdev; - state(hdlc)->pppdev.dev = dev; + case __constant_htons(PID_IPV6): + skb_pull(skb, sizeof(struct hdlc_header)); + return htons(ETH_P_IPV6); - old_ioctl = dev->do_ioctl; - state(hdlc)->old_change_mtu = dev->change_mtu; - sppp_attach(&state(hdlc)->pppdev); - /* sppp_attach nukes them. We don't need syncppp's ioctl */ - dev->do_ioctl = old_ioctl; - state(hdlc)->pppdev.sppp.pp_flags &= ~PP_CISCO; - dev->type = ARPHRD_PPP; - result = sppp_open(dev); - if (result) { - sppp_detach(dev); - return result; + default: + return htons(ETH_P_HDLC); } +} - return 0; + +static int ppp_hard_header(struct sk_buff *skb, struct net_device *dev, + u16 type, const void *daddr, const void *saddr, + unsigned int len) +{ + struct hdlc_header *data; +#if DEBUG_HARD_HEADER + printk(KERN_DEBUG "%s: ppp_hard_header() called\n", dev->name); +#endif + + skb_push(skb, sizeof(struct hdlc_header)); + data = (struct hdlc_header*)skb->data; + + data->address = HDLC_ADDR_ALLSTATIONS; + data->control = HDLC_CTRL_UI; + switch (type) { + case ETH_P_IP: + data->protocol = htons(PID_IP); + break; + case ETH_P_IPV6: + data->protocol = htons(PID_IPV6); + break; + case PID_LCP: + case PID_IPCP: + case PID_IPV6CP: + data->protocol = htons(type); + break; + default: /* unknown protocol */ + data->protocol = 0; + } + return sizeof(struct hdlc_header); } +static void ppp_tx_flush(void) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(&tx_queue)) != NULL) + dev_queue_xmit(skb); +} -static void ppp_close(struct net_device *dev) +static void ppp_tx_cp(struct net_device *dev, u16 pid, u8 code, + u8 id, unsigned int len, const void *data) { - hdlc_device *hdlc = dev_to_hdlc(dev); + struct sk_buff *skb; + struct cp_header *cp; + unsigned int magic_len = 0; + static u32 magic; + +#if DEBUG_CP + int i; + char *ptr; +#endif + + if (pid == PID_LCP && (code == LCP_ECHO_REQ || code == LCP_ECHO_REPLY)) + magic_len = sizeof(magic); + + skb = dev_alloc_skb(sizeof(struct hdlc_header) + + sizeof(struct cp_header) + magic_len + len); + if (!skb) { + printk(KERN_WARNING "%s: out of memory in ppp_tx_cp()\n", + dev->name); + return; + } + skb_reserve(skb, sizeof(struct hdlc_header)); + + cp = (struct cp_header *)skb_put(skb, sizeof(struct cp_header)); + cp->code = code; + cp->id = id; + cp->len = htons(sizeof(struct cp_header) + magic_len + len); + + if (magic_len) + memcpy(skb_put(skb, magic_len), &magic, magic_len); + if (len) + memcpy(skb_put(skb, len), data, len); + +#if DEBUG_CP + BUG_ON(code >= CP_CODES); + ptr = debug_buffer; + *ptr = '\x0'; + for (i = 0; i < min_t(unsigned int, magic_len + len, DEBUG_CP); i++) { + sprintf(ptr, " %02X", skb->data[sizeof(struct cp_header) + i]); + ptr += strlen(ptr); + } + printk(KERN_DEBUG "%s: TX %s [%s id 0x%X]%s\n", dev->name, + proto_name(pid), code_names[code], id, debug_buffer); +#endif - sppp_close(dev); - sppp_detach(dev); + ppp_hard_header(skb, dev, pid, NULL, NULL, 0); - dev->change_mtu = state(hdlc)->old_change_mtu; - dev->mtu = HDLC_MAX_MTU; - dev->hard_header_len = 16; + skb->priority = TC_PRIO_CONTROL; + skb->dev = dev; + skb_reset_network_header(skb); + skb_queue_tail(&tx_queue, skb); } +/* State transition table (compare STD-51) + Events Actions + TO+ = Timeout with counter > 0 irc = Initialize-Restart-Count + TO- = Timeout with counter expired zrc = Zero-Restart-Count + + RCR+ = Receive-Configure-Request (Good) scr = Send-Configure-Request + RCR- = Receive-Configure-Request (Bad) + RCA = Receive-Configure-Ack sca = Send-Configure-Ack + RCN = Receive-Configure-Nak/Rej scn = Send-Configure-Nak/Rej + + RTR = Receive-Terminate-Request str = Send-Terminate-Request + RTA = Receive-Terminate-Ack sta = Send-Terminate-Ack + + RUC = Receive-Unknown-Code scj = Send-Code-Reject + RXJ+ = Receive-Code-Reject (permitted) + or Receive-Protocol-Reject + RXJ- = Receive-Code-Reject (catastrophic) + or Receive-Protocol-Reject +*/ +static int cp_table[EVENTS][STATES] = { + /* CLOSED STOPPED STOPPING REQ_SENT ACK_RECV ACK_SENT OPENED + 0 1 2 3 4 5 6 */ + {IRC|SCR|3, INV , INV , INV , INV , INV , INV }, /* START */ + { INV , 0 , 0 , 0 , 0 , 0 , 0 }, /* STOP */ + { INV , INV ,STR|2, SCR|3 ,SCR|3, SCR|5 , INV }, /* TO+ */ + { INV , INV , 1 , 1 , 1 , 1 , INV }, /* TO- */ + { STA|0 ,IRC|SCR|SCA|5, 2 , SCA|5 ,SCA|6, SCA|5 ,SCR|SCA|5}, /* RCR+ */ + { STA|0 ,IRC|SCR|SCN|3, 2 , SCN|3 ,SCN|4, SCN|3 ,SCR|SCN|3}, /* RCR- */ + { STA|0 , STA|1 , 2 , IRC|4 ,SCR|3, 6 , SCR|3 }, /* RCA */ + { STA|0 , STA|1 , 2 ,IRC|SCR|3,SCR|3,IRC|SCR|5, SCR|3 }, /* RCN */ + { STA|0 , STA|1 ,STA|2, STA|3 ,STA|3, STA|3 ,ZRC|STA|2}, /* RTR */ + { 0 , 1 , 1 , 3 , 3 , 5 , SCR|3 }, /* RTA */ + { SCJ|0 , SCJ|1 ,SCJ|2, SCJ|3 ,SCJ|4, SCJ|5 , SCJ|6 }, /* RUC */ + { 0 , 1 , 2 , 3 , 3 , 5 , 6 }, /* RXJ+ */ + { 0 , 1 , 1 , 1 , 1 , 1 ,IRC|STR|2}, /* RXJ- */ +}; + -static __be16 ppp_type_trans(struct sk_buff *skb, struct net_device *dev) +/* SCA: RCR+ must supply id, len and data + SCN: RCR- must supply code, id, len and data + STA: RTR must supply id + SCJ: RUC must supply CP packet len and data */ +static void ppp_cp_event(struct net_device *dev, u16 pid, u16 event, u8 code, + u8 id, unsigned int len, void *data) { - return __constant_htons(ETH_P_WAN_PPP); + int old_state, action; + struct ppp *ppp = get_ppp(dev); + struct proto *proto = get_proto(dev, pid); + + old_state = proto->state; + BUG_ON(old_state >= STATES); + BUG_ON(event >= EVENTS); + +#if DEBUG_STATE + printk(KERN_DEBUG "%s: %s ppp_cp_event(%s) %s ...\n", dev->name, + proto_name(pid), event_names[event], state_names[proto->state]); +#endif + + action = cp_table[event][old_state]; + + proto->state = action & STATE_MASK; + if (action & (SCR | STR)) /* set Configure-Req/Terminate-Req timer */ + mod_timer(&proto->timer, proto->timeout = + jiffies + ppp->req_timeout * HZ); + if (action & ZRC) + proto->restart_counter = 0; + if (action & IRC) + proto->restart_counter = (proto->state == STOPPING) ? + ppp->term_retries : ppp->cr_retries; + + if (action & SCR) /* send Configure-Request */ + ppp_tx_cp(dev, pid, CP_CONF_REQ, proto->cr_id = ++ppp->seq, + 0, NULL); + if (action & SCA) /* send Configure-Ack */ + ppp_tx_cp(dev, pid, CP_CONF_ACK, id, len, data); + if (action & SCN) /* send Configure-Nak/Reject */ + ppp_tx_cp(dev, pid, code, id, len, data); + if (action & STR) /* send Terminate-Request */ + ppp_tx_cp(dev, pid, CP_TERM_REQ, ++ppp->seq, 0, NULL); + if (action & STA) /* send Terminate-Ack */ + ppp_tx_cp(dev, pid, CP_TERM_ACK, id, 0, NULL); + if (action & SCJ) /* send Code-Reject */ + ppp_tx_cp(dev, pid, CP_CODE_REJ, ++ppp->seq, len, data); + + if (old_state != OPENED && proto->state == OPENED) { + printk(KERN_INFO "%s: %s up\n", dev->name, proto_name(pid)); + if (pid == PID_LCP) { + netif_dormant_off(dev); + ppp_cp_event(dev, PID_IPCP, START, 0, 0, 0, NULL); + ppp_cp_event(dev, PID_IPV6CP, START, 0, 0, 0, NULL); + ppp->last_pong = jiffies; + mod_timer(&proto->timer, proto->timeout = + jiffies + ppp->keepalive_interval * HZ); + } + } + if (old_state == OPENED && proto->state != OPENED) { + printk(KERN_INFO "%s: %s down\n", dev->name, proto_name(pid)); + if (pid == PID_LCP) { + netif_dormant_on(dev); + ppp_cp_event(dev, PID_IPCP, STOP, 0, 0, 0, NULL); + ppp_cp_event(dev, PID_IPV6CP, STOP, 0, 0, 0, NULL); + } + } + if (old_state != CLOSED && proto->state == CLOSED) + del_timer(&proto->timer); + +#if DEBUG_STATE + printk(KERN_DEBUG "%s: %s ppp_cp_event(%s) ... %s\n", dev->name, + proto_name(pid), event_names[event], state_names[proto->state]); +#endif } +static void ppp_cp_parse_cr(struct net_device *dev, u16 pid, u8 id, + unsigned int len, u8 *data) +{ + static u8 const valid_accm[6] = { LCP_OPTION_ACCM, 6, 0, 0, 0, 0 }; + u8 *opt, *out; + unsigned int nak_len = 0, rej_len = 0; + + if (!(out = kmalloc(len, GFP_ATOMIC))) { + dev->stats.rx_dropped++; + return; /* out of memory, ignore CR packet */ + } + + for (opt = data; len; len -= opt[1], opt += opt[1]) { + if (len < 2 || len < opt[1]) { + dev->stats.rx_errors++; + return; /* bad packet, drop silently */ + } + + if (pid == PID_LCP) + switch (opt[0]) { + case LCP_OPTION_MRU: + continue; /* MRU always OK and > 1500 bytes? */ + + case LCP_OPTION_ACCM: /* async control character map */ + if (!memcmp(opt, valid_accm, + sizeof(valid_accm))) + continue; + if (!rej_len) { /* NAK it */ + memcpy(out + nak_len, valid_accm, + sizeof(valid_accm)); + nak_len += sizeof(valid_accm); + continue; + } + break; + case LCP_OPTION_MAGIC: + if (opt[1] != 6 || (!opt[2] && !opt[3] && + !opt[4] && !opt[5])) + break; /* reject invalid magic number */ + continue; + } + /* reject this option */ + memcpy(out + rej_len, opt, opt[1]); + rej_len += opt[1]; + } + + if (rej_len) + ppp_cp_event(dev, pid, RCR_BAD, CP_CONF_REJ, id, rej_len, out); + else if (nak_len) + ppp_cp_event(dev, pid, RCR_BAD, CP_CONF_NAK, id, nak_len, out); + else + ppp_cp_event(dev, pid, RCR_GOOD, CP_CONF_ACK, id, len, data); + + kfree(out); +} + +static int ppp_rx(struct sk_buff *skb) +{ + struct hdlc_header *hdr = (struct hdlc_header*)skb->data; + struct net_device *dev = skb->dev; + struct ppp *ppp = get_ppp(dev); + struct proto *proto; + struct cp_header *cp; + unsigned long flags; + unsigned int len; + u16 pid; +#if DEBUG_CP + int i; + char *ptr; +#endif + + spin_lock_irqsave(&ppp->lock, flags); + /* Check HDLC header */ + if (skb->len < sizeof(struct hdlc_header)) + goto rx_error; + cp = (struct cp_header*)skb_pull(skb, sizeof(struct hdlc_header)); + if (hdr->address != HDLC_ADDR_ALLSTATIONS || + hdr->control != HDLC_CTRL_UI) + goto rx_error; + + pid = ntohs(hdr->protocol); + proto = get_proto(dev, pid); + if (!proto) { + if (ppp->protos[IDX_LCP].state == OPENED) + ppp_tx_cp(dev, PID_LCP, LCP_PROTO_REJ, + ++ppp->seq, skb->len + 2, &hdr->protocol); + goto rx_error; + } + + len = ntohs(cp->len); + if (len < sizeof(struct cp_header) /* no complete CP header? */ || + skb->len < len /* truncated packet? */) + goto rx_error; + skb_pull(skb, sizeof(struct cp_header)); + len -= sizeof(struct cp_header); + + /* HDLC and CP headers stripped from skb */ +#if DEBUG_CP + if (cp->code < CP_CODES) + sprintf(debug_buffer, "[%s id 0x%X]", code_names[cp->code], + cp->id); + else + sprintf(debug_buffer, "[code %u id 0x%X]", cp->code, cp->id); + ptr = debug_buffer + strlen(debug_buffer); + for (i = 0; i < min_t(unsigned int, len, DEBUG_CP); i++) { + sprintf(ptr, " %02X", skb->data[i]); + ptr += strlen(ptr); + } + printk(KERN_DEBUG "%s: RX %s %s\n", dev->name, proto_name(pid), + debug_buffer); +#endif + + /* LCP only */ + if (pid == PID_LCP) + switch (cp->code) { + case LCP_PROTO_REJ: + pid = ntohs(*(__be16*)skb->data); + if (pid == PID_LCP || pid == PID_IPCP || + pid == PID_IPV6CP) + ppp_cp_event(dev, pid, RXJ_BAD, 0, 0, + 0, NULL); + goto out; + + case LCP_ECHO_REQ: /* send Echo-Reply */ + if (len >= 4 && proto->state == OPENED) + ppp_tx_cp(dev, PID_LCP, LCP_ECHO_REPLY, + cp->id, len - 4, skb->data + 4); + goto out; + + case LCP_ECHO_REPLY: + if (cp->id == ppp->echo_id) + ppp->last_pong = jiffies; + goto out; + + case LCP_DISC_REQ: /* discard */ + goto out; + } + + /* LCP, IPCP and IPV6CP */ + switch (cp->code) { + case CP_CONF_REQ: + ppp_cp_parse_cr(dev, pid, cp->id, len, skb->data); + goto out; + + case CP_CONF_ACK: + if (cp->id == proto->cr_id) + ppp_cp_event(dev, pid, RCA, 0, 0, 0, NULL); + goto out; + + case CP_CONF_REJ: + case CP_CONF_NAK: + if (cp->id == proto->cr_id) + ppp_cp_event(dev, pid, RCN, 0, 0, 0, NULL); + goto out; + + case CP_TERM_REQ: + ppp_cp_event(dev, pid, RTR, 0, cp->id, 0, NULL); + goto out; + + case CP_TERM_ACK: + ppp_cp_event(dev, pid, RTA, 0, 0, 0, NULL); + goto out; + + case CP_CODE_REJ: + ppp_cp_event(dev, pid, RXJ_BAD, 0, 0, 0, NULL); + goto out; + + default: + len += sizeof(struct cp_header); + if (len > dev->mtu) + len = dev->mtu; + ppp_cp_event(dev, pid, RUC, 0, 0, len, cp); + goto out; + } + goto out; + +rx_error: + dev->stats.rx_errors++; +out: + spin_unlock_irqrestore(&ppp->lock, flags); + dev_kfree_skb_any(skb); + ppp_tx_flush(); + return NET_RX_DROP; +} + + +static void ppp_timer(unsigned long arg) +{ + struct proto *proto = (struct proto *)arg; + struct ppp *ppp = get_ppp(proto->dev); + unsigned long flags; + + spin_lock_irqsave(&ppp->lock, flags); + switch (proto->state) { + case STOPPING: + case REQ_SENT: + case ACK_RECV: + case ACK_SENT: + if (proto->restart_counter) { + ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0, + 0, NULL); + proto->restart_counter--; + } else + ppp_cp_event(proto->dev, proto->pid, TO_BAD, 0, 0, + 0, NULL); + break; + + case OPENED: + if (proto->pid != PID_LCP) + break; + if (time_after(jiffies, ppp->last_pong + + ppp->keepalive_timeout * HZ)) { + printk(KERN_INFO "%s: Link down\n", proto->dev->name); + ppp_cp_event(proto->dev, PID_LCP, STOP, 0, 0, 0, NULL); + ppp_cp_event(proto->dev, PID_LCP, START, 0, 0, 0, NULL); + } else { /* send keep-alive packet */ + ppp->echo_id = ++ppp->seq; + ppp_tx_cp(proto->dev, PID_LCP, LCP_ECHO_REQ, + ppp->echo_id, 0, NULL); + proto->timer.expires = jiffies + + ppp->keepalive_interval * HZ; + add_timer(&proto->timer); + } + break; + } + spin_unlock_irqrestore(&ppp->lock, flags); + ppp_tx_flush(); +} + + +static void ppp_start(struct net_device *dev) +{ + struct ppp *ppp = get_ppp(dev); + int i; + + for (i = 0; i < IDX_COUNT; i++) { + struct proto *proto = &ppp->protos[i]; + proto->dev = dev; + init_timer(&proto->timer); + proto->timer.function = ppp_timer; + proto->timer.data = (unsigned long)proto; + proto->state = CLOSED; + } + ppp->protos[IDX_LCP].pid = PID_LCP; + ppp->protos[IDX_IPCP].pid = PID_IPCP; + ppp->protos[IDX_IPV6CP].pid = PID_IPV6CP; + + ppp_cp_event(dev, PID_LCP, START, 0, 0, 0, NULL); +} + +static void ppp_stop(struct net_device *dev) +{ + ppp_cp_event(dev, PID_LCP, STOP, 0, 0, 0, NULL); +} static struct hdlc_proto proto = { - .open = ppp_open, - .close = ppp_close, + .start = ppp_start, + .stop = ppp_stop, .type_trans = ppp_type_trans, .ioctl = ppp_ioctl, + .netif_rx = ppp_rx, .module = THIS_MODULE, }; +static const struct header_ops ppp_header_ops = { + .create = ppp_hard_header, +}; static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr) { hdlc_device *hdlc = dev_to_hdlc(dev); + struct ppp *ppp; int result; switch (ifr->ifr_settings.type) { @@ -109,25 +654,35 @@ static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr) return 0; /* return protocol only, no settable parameters */ case IF_PROTO_PPP: - if(!capable(CAP_NET_ADMIN)) + if (!capable(CAP_NET_ADMIN)) return -EPERM; - if(dev->flags & IFF_UP) + if (dev->flags & IFF_UP) return -EBUSY; /* no settable parameters */ - result=hdlc->attach(dev, ENCODING_NRZ,PARITY_CRC16_PR1_CCITT); + result = hdlc->attach(dev, ENCODING_NRZ,PARITY_CRC16_PR1_CCITT); if (result) return result; - result = attach_hdlc_protocol(dev, &proto, - sizeof(struct ppp_state)); + result = attach_hdlc_protocol(dev, &proto, sizeof(struct ppp)); if (result) return result; + + ppp = get_ppp(dev); + spin_lock_init(&ppp->lock); + ppp->req_timeout = 2; + ppp->cr_retries = 10; + ppp->term_retries = 2; + ppp->keepalive_interval = 10; + ppp->keepalive_timeout = 60; + dev->hard_start_xmit = hdlc->xmit; + dev->hard_header_len = sizeof(struct hdlc_header); + dev->header_ops = &ppp_header_ops; dev->type = ARPHRD_PPP; - netif_dormant_off(dev); + netif_dormant_on(dev); return 0; } @@ -137,12 +692,11 @@ static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr) static int __init mod_init(void) { + skb_queue_head_init(&tx_queue); register_hdlc_protocol(&proto); return 0; } - - static void __exit mod_exit(void) { unregister_hdlc_protocol(&proto); -- cgit v1.2.3-70-g09d2 From b20a9c24d5c5d466d7e4a25c6f1bedbd2d16ad4f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 23 Nov 2008 16:02:31 -0800 Subject: dccp: Set per-connection CCIDs via socket options With this patch, TX/RX CCIDs can now be changed on a per-connection basis, which overrides the defaults set by the global sysctl variables for TX/RX CCIDs. To make full use of this facility, the remaining patches of this patch set are needed, which track dependencies and activate negotiated feature values. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 14 ++++++++++++++ include/linux/dccp.h | 5 +++++ net/dccp/ackvec.c | 9 ++++----- net/dccp/ackvec.h | 5 ++--- net/dccp/feat.h | 2 ++ net/dccp/proto.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 61 insertions(+), 8 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 43df4487379..610083ff73f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -61,6 +61,20 @@ DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs supported by the endpoint (see include/linux/dccp.h for symbolic constants). The caller needs to provide a sufficiently large (> 2) array of type uint8_t. +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferrable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is `int', not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eda389ce04f..6a72ff52a8a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -168,6 +168,8 @@ enum { DCCPO_MIN_CCID_SPECIFIC = 128, DCCPO_MAX_CCID_SPECIFIC = 255, }; +/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ +#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -203,6 +205,9 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 +#define DCCP_SOCKOPT_CCID 13 +#define DCCP_SOCKOPT_TX_CCID 14 +#define DCCP_SOCKOPT_RX_CCID 15 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad1..01e4d39fa23 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -12,7 +12,6 @@ #include "ackvec.h" #include "dccp.h" -#include #include #include #include @@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); u16 len = av->av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; @@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) for (i = 0; i < nr_opts; ++i) { int copylen = len; - if (len > DCCP_MAX_ACKVEC_OPT_LEN) - copylen = DCCP_MAX_ACKVEC_OPT_LEN; + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; *to++ = DCCPO_ACK_VECTOR_0; *to++ = copylen + 2; @@ -432,7 +431,7 @@ found: int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - if (len > DCCP_MAX_ACKVEC_OPT_LEN) + if (len > DCCP_SINGLE_OPT_MAXLEN) return -1; /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index bcb64fb4ace..4ccee030524 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -11,15 +11,14 @@ * published by the Free Software Foundation. */ +#include #include #include #include #include -/* Read about the ECN nonce to see why it is 253 */ -#define DCCP_MAX_ACKVEC_OPT_LEN 253 /* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) +#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 4d172822df1..093af1610d1 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -22,6 +22,8 @@ /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ #define DCCPF_SEQ_WMIN 32 #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull +/* Maximum number of SP values that fit in a single (Confirm) option */ +#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) enum dccp_feat_type { FEAT_AT_RX = 1, /* located at RX side of half-connection */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8b63394ec24..445884cf1c2 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -501,6 +501,36 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) return rc; } +static int dccp_setsockopt_ccid(struct sock *sk, int type, + char __user *optval, int optlen) +{ + u8 *val; + int rc = 0; + + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + return -EINVAL; + + val = kmalloc(optlen, GFP_KERNEL); + if (val == NULL) + return -ENOMEM; + + if (copy_from_user(val, optval, optlen)) { + kfree(val); + return -EFAULT; + } + + lock_sock(sk); + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); + release_sock(sk); + + kfree(val); + return rc; +} + static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -515,6 +545,10 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; + case DCCP_SOCKOPT_CCID: + case DCCP_SOCKOPT_RX_CCID: + case DCCP_SOCKOPT_TX_CCID: + return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) -- cgit v1.2.3-70-g09d2 From b74ca3a896b9ab5f952bc440154758e708c48884 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 8 Dec 2008 01:14:16 -0800 Subject: netdevice: Kill netdev->priv This is the last shoot of this series. After I removing all directly reference of netdev->priv, I am killing "priv" of "struct net_device" and fixing relative comments/docs. Anyone will not be allowed to reference netdev->priv directly. If you want to reference the memory of private data, use netdev_priv() instead. If the private data is not allocted when alloc_netdev(), use netdev->ml_priv to point that memory after you creating that private data. Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- Documentation/networking/driver.txt | 2 +- Documentation/networking/netdevices.txt | 2 +- drivers/net/3c501.h | 2 +- drivers/net/atp.c | 2 +- drivers/net/eexpress.c | 2 +- drivers/net/forcedeth.c | 4 ++-- drivers/net/lance.c | 2 +- drivers/net/myri_sbus.c | 2 +- drivers/net/pci-skeleton.c | 2 +- drivers/net/sun3_82586.c | 2 +- drivers/net/sunbmac.c | 2 +- drivers/net/tokenring/3c359.c | 5 +++-- drivers/net/via-rhine.c | 9 +++++---- drivers/net/wireless/strip.c | 2 +- include/linux/hdlc.h | 2 +- include/linux/netdevice.h | 1 - net/atm/mpc.c | 4 ++-- net/core/dev.c | 6 ------ 18 files changed, 24 insertions(+), 29 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/driver.txt b/Documentation/networking/driver.txt index ea72d2e66ca..03283daa64f 100644 --- a/Documentation/networking/driver.txt +++ b/Documentation/networking/driver.txt @@ -13,7 +13,7 @@ Transmit path guidelines: static int drv_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct drv *dp = dev->priv; + struct drv *dp = netdev_priv(dev); lock_tx(dp); ... diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt index d0f71fc7f78..a2ab6a0b116 100644 --- a/Documentation/networking/netdevices.txt +++ b/Documentation/networking/netdevices.txt @@ -18,7 +18,7 @@ There are routines in net_init.c to handle the common cases of alloc_etherdev, alloc_netdev. These reserve extra space for driver private data which gets freed when the network device is freed. If separately allocated data is attached to the network device -(dev->priv) then it is up to the module exit handler to free that. +(netdev_priv(dev)) then it is up to the module exit handler to free that. MTU === diff --git a/drivers/net/3c501.h b/drivers/net/3c501.h index cfec64efff7..f40b0493337 100644 --- a/drivers/net/3c501.h +++ b/drivers/net/3c501.h @@ -23,7 +23,7 @@ static const struct ethtool_ops netdev_ethtool_ops; static int el_debug = EL_DEBUG; /* - * Board-specific info in dev->priv. + * Board-specific info in netdev_priv(dev). */ struct net_local diff --git a/drivers/net/atp.c b/drivers/net/atp.c index 7028b276dfd..1d6b74c5d6c 100644 --- a/drivers/net/atp.c +++ b/drivers/net/atp.c @@ -420,7 +420,7 @@ static unsigned short __init eeprom_op(long ioaddr, u32 cmd) registers that "should" only need to be set once at boot, so that there is non-reboot way to recover if something goes wrong. - This is an attachable device: if there is no dev->priv entry then it wasn't + This is an attachable device: if there is no private entry then it wasn't probed for at boot-time, and we need to probe for it again. */ static int net_open(struct net_device *dev) diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index a125e41240f..9ff3f2f5e38 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -1046,7 +1046,7 @@ static void eexp_hw_tx_pio(struct net_device *dev, unsigned short *buf, /* * Sanity check the suspected EtherExpress card * Read hardware address, reset card, size memory and initialize buffer - * memory pointers. These are held in dev->priv, in case someone has more + * memory pointers. These are held in netdev_priv(), in case someone has more * than one card in a machine. */ diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 12384df8cb2..1f2b24743ee 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -712,12 +712,12 @@ struct nv_skb_map { /* * SMP locking: - * All hardware access under dev->priv->lock, except the performance + * All hardware access under netdev_priv(dev)->lock, except the performance * critical parts: * - rx is (pseudo-) lockless: it relies on the single-threading provided * by the arch code for interrupts. * - tx setup is lockless: it relies on netif_tx_lock. Actual submission - * needs dev->priv->lock :-( + * needs netdev_priv(dev)->lock :-( * - set_multicast_list: preparation lockless, relies on netif_tx_lock. */ diff --git a/drivers/net/lance.c b/drivers/net/lance.c index e81b6113ed9..d7afb938ea6 100644 --- a/drivers/net/lance.c +++ b/drivers/net/lance.c @@ -519,7 +519,7 @@ static int __init lance_probe1(struct net_device *dev, int ioaddr, int irq, int } } - /* We can't allocate dev->priv from alloc_etherdev() because it must + /* We can't allocate private data from alloc_etherdev() because it must a ISA DMA-able region. */ chipname = chip_table[lance_version].name; printk("%s: %s at %#3x, ", dev->name, chipname, ioaddr); diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c index 6833f65f8ae..899ed065a14 100644 --- a/drivers/net/myri_sbus.c +++ b/drivers/net/myri_sbus.c @@ -1091,7 +1091,7 @@ static int __devinit myri_sbus_probe(struct of_device *op, const struct of_devic err_free_irq: free_irq(dev->irq, dev); err: - /* This will also free the co-allocated 'dev->priv' */ + /* This will also free the co-allocated private data*/ free_netdev(dev); return -ENODEV; } diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c index b23b5c397b1..c95fd72c3bb 100644 --- a/drivers/net/pci-skeleton.c +++ b/drivers/net/pci-skeleton.c @@ -781,7 +781,7 @@ static int __devinit netdrv_init_one (struct pci_dev *pdev, dev->irq = pdev->irq; dev->base_addr = (unsigned long) ioaddr; - /* dev->priv/tp zeroed and aligned in alloc_etherdev */ + /* netdev_priv()/tp zeroed and aligned in alloc_etherdev */ tp = netdev_priv(dev); /* note: tp->chipset set in netdrv_init_board */ diff --git a/drivers/net/sun3_82586.c b/drivers/net/sun3_82586.c index e8f97d5c9c2..e0d84772771 100644 --- a/drivers/net/sun3_82586.c +++ b/drivers/net/sun3_82586.c @@ -209,7 +209,7 @@ static int sun3_82586_open(struct net_device *dev) static int check586(struct net_device *dev,char *where,unsigned size) { struct priv pb; - struct priv *p = /* (struct priv *) dev->priv*/ &pb; + struct priv *p = &pb; char *iscp_addr; int i; diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c index 977b3e08bbf..7f69c7f176c 100644 --- a/drivers/net/sunbmac.c +++ b/drivers/net/sunbmac.c @@ -1233,7 +1233,7 @@ fail_and_cleanup: bp->bmac_block, bp->bblock_dvma); - /* This also frees the co-located 'dev->priv' */ + /* This also frees the co-located private data */ free_netdev(dev); return -ENODEV; } diff --git a/drivers/net/tokenring/3c359.c b/drivers/net/tokenring/3c359.c index e7a944657cf..43853e3b210 100644 --- a/drivers/net/tokenring/3c359.c +++ b/drivers/net/tokenring/3c359.c @@ -296,8 +296,9 @@ static int __devinit xl_probe(struct pci_dev *pdev, } ; /* - * Allowing init_trdev to allocate the dev->priv structure will align xl_private - * on a 32 bytes boundary which we need for the rx/tx descriptors + * Allowing init_trdev to allocate the private data will align + * xl_private on a 32 bytes boundary which we need for the rx/tx + * descriptors */ dev = alloc_trdev(sizeof(struct xl_private)) ; diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index 93b74b7b707..8d405c83df8 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -191,12 +191,13 @@ IIId. Synchronization The driver runs as two independent, single-threaded flows of control. One is the send-packet routine, which enforces single-threaded use by the -dev->priv->lock spinlock. The other thread is the interrupt handler, which -is single threaded by the hardware and interrupt handling software. +netdev_priv(dev)->lock spinlock. The other thread is the interrupt handler, +which is single threaded by the hardware and interrupt handling software. The send packet thread has partial control over the Tx ring. It locks the -dev->priv->lock whenever it's queuing a Tx packet. If the next slot in the ring -is not available it stops the transmit queue by calling netif_stop_queue. +netdev_priv(dev)->lock whenever it's queuing a Tx packet. If the next slot in +the ring is not available it stops the transmit queue by +calling netif_stop_queue. The interrupt handler has exclusive control over the Rx ring and records stats from the Tx ring. After reaping the stats, it marks the Tx queue entry as diff --git a/drivers/net/wireless/strip.c b/drivers/net/wireless/strip.c index 692e6c5e009..dd0de3a9ed4 100644 --- a/drivers/net/wireless/strip.c +++ b/drivers/net/wireless/strip.c @@ -2494,7 +2494,7 @@ static void strip_dev_setup(struct net_device *dev) dev->type = ARPHRD_METRICOM; /* dtang */ dev->hard_header_len = sizeof(STRIP_Header); /* - * dev->priv Already holds a pointer to our struct strip + * netdev_priv(dev) Already holds a pointer to our struct strip */ *(MetricomAddress *) & dev->broadcast = broadcast_address; diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index e960faac609..fd47a151665 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -43,7 +43,7 @@ struct hdlc_proto { }; -/* Pointed to by dev->priv */ +/* Pointed to by netdev_priv(dev) */ typedef struct hdlc_device { /* used by HDLC layer to take control over HDLC device from hw driver*/ int (*attach)(struct net_device *dev, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0df0db068ac..47e73152831 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -785,7 +785,6 @@ struct net_device /* * One part is mostly used on xmit path (device) */ - void *priv; /* pointer to private data */ /* These may be needed for future network-power-down code. */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 12e9ea371db..039d5cc72c3 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -341,8 +341,8 @@ static const char *mpoa_device_type_string(char type) } /* - * lec device calls this via its dev->priv->lane2_ops->associate_indicator() - * when it sees a TLV in LE_ARP packet. + * lec device calls this via its netdev_priv(dev)->lane2_ops + * ->associate_indicator() when it sees a TLV in LE_ARP packet. * We fill in the pointer above when we see a LANE2 lec initializing * See LANE2 spec 3.1.5 * diff --git a/net/core/dev.c b/net/core/dev.c index 4615e9a443a..f54cac76438 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4378,12 +4378,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; - if (sizeof_priv) { - dev->priv = ((char *)dev + - ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) - & ~NETDEV_ALIGN_CONST)); - } - dev->gso_max_size = GSO_MAX_SIZE; netdev_init_queues(dev); -- cgit v1.2.3-70-g09d2 From 0049bab5e765aa74cf767a834fa336e19453fc5e Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 8 Dec 2008 01:18:05 -0800 Subject: dccp: Remove obsolete parts of the old CCID interface The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector case, their value equals initially that of the sysctl, but at the end of feature negotiation may be something different. The old interface removed by this patch thus has been replaced by the newer interface to dynamically query the currently loaded CCIDs. Also removed are the constructors for the TX CCID and the RX CCID, since the switch "rx <-> non-rx" is done by the handler in minisocks.c (and the handler is the only place in the code where CCIDs are loaded). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 5 +++-- include/linux/dccp.h | 3 --- net/dccp/ccid.c | 14 -------------- net/dccp/ccid.h | 5 ----- net/dccp/feat.c | 13 ------------- net/dccp/minisocks.c | 2 -- 6 files changed, 3 insertions(+), 39 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 610083ff73f..a203d132dbe 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -140,10 +140,11 @@ send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). tx_ccid = 2 - Default CCID for the sender-receiver half-connection. + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. rx_ccid = 2 - Default CCID for the receiver-sender half-connection. + Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 The initial sequence window (sec. 7.5.2). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6a72ff52a8a..46daea312d9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -370,7 +370,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated @@ -378,8 +377,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_rx_ccid; - __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 647cb0614f8..bcc643f992a 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -253,20 +253,6 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); -struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 1, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_rx_new); - -struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 0, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_tx_new); - static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 803343aed00..18f69423a70 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -111,11 +111,6 @@ extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, - gfp_t gfp); -extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, - gfp_t gfp); - static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 077f78d579c..a0d5891a37b 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1124,22 +1124,9 @@ int dccp_feat_init(struct sock *sk) INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ - /* CCID L */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0, - &dmsk->dccpms_tx_ccid, 1); - if (rc) - goto out; - - /* CCID R */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0, - &dmsk->dccpms_rx_ccid, 1); - if (rc) - goto out; - /* Ack ratio */ rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, dp->dccps_l_ack_ratio); -out: return rc; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 308b6b928c3..210c346899b 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,8 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; - dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } -- cgit v1.2.3-70-g09d2 From 4098dce5be537a157eed4a326efd464109825b8b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 8 Dec 2008 01:18:37 -0800 Subject: dccp: Remove manual influence on NDP Count feature Updating the NDP count feature is handled automatically now: * for CCID-2 it is disabled, since the code does not use NDP counts; * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths. Allowing the user to change NDP values leads to unpredictable and failing behaviour, since it is then possible to disable NDP counts even when they are needed (e.g. in CCID-3). This means that only those user settings are sensible that agree with the values for Send NDP Count implied by the choice of CCID. But those settings are already activated by the feature negotiation (CCID dependency tracking), hence this form of support is redundant. At startup the initialisation of the NDP count feature uses the default value of 0, which is done implicitly by the zeroing-out of the socket when it is allocated. If the choice of CCID or feature negotiation enables NDP count, this will then be updated via the NDP activation handler. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 4 ++-- net/dccp/dccp.h | 1 - net/dccp/feat.c | 2 +- net/dccp/minisocks.c | 1 - net/dccp/options.c | 4 +--- net/dccp/sysctl.c | 7 ------- 7 files changed, 4 insertions(+), 18 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index a203d132dbe..1403745ab40 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ndp = 1 - Whether or not to send NDP count options (sec. 7.7.2). - send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 46daea312d9..60e94438ead 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -371,14 +371,12 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; __u8 dccpms_send_ack_vector; - __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; struct list_head dccpms_conf; }; @@ -490,6 +488,7 @@ struct dccp_ackvec; * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) @@ -529,6 +528,7 @@ struct dccp_sock { __u16 dccps_r_ack_ratio; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; + __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 94f6785f81e..e0759d098b9 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -99,7 +99,6 @@ extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_feat_send_ack_vector; -extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index a0d5891a37b..30f9fb76b92 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -85,7 +85,7 @@ static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) { if (!rx) - dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0); + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 210c346899b..02b14812464 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -46,7 +46,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; - dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index debb1008c7a..e9d674874b4 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -27,7 +27,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; -int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -531,8 +530,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dmsk->dccpms_send_ndp_count && - dccp_insert_option_ndp(sk, skb)) + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) return -1; if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index f6e54f433e2..587c12f915c 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -47,13 +47,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ndp", - .data = &sysctl_dccp_feat_send_ndp_count, - .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3-70-g09d2 From 6fdd34d43bff8be9bb925b49d87a0ee144d2ab07 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 8 Dec 2008 01:19:06 -0800 Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl This removes the use of the sysctl and the minisock variable for the Send Ack Vector feature, as it now is handled fully dynamically via feature negotiation (i.e. when CCID-2 is enabled, Ack Vectors are automatically enabled as per RFC 4341, 4.). Using a sysctl in parallel to this implementation would open the door to crashes, since much of the code relies on tests of the boolean minisock / sysctl variable. Thus, this patch replaces all tests of type if (dccp_msk(sk)->dccpms_send_ack_vector) /* ... */ with if (dp->dccps_hc_rx_ackvec != NULL) /* ... */ The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature negotiation concluded that Ack Vectors are to be used on the half-connection. Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child), so that the test is a valid one. The activation handler for Ack Vectors is called as soon as the feature negotiation has concluded at the * server when the Ack marking the transition RESPOND => OPEN arrives; * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN. Adding the sequence number of the Response packet to the Ack Vector has been removed, since (a) connection establishment implies that the Response has been received; (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e. this entry will always be ignored; (c) it can not be used for anything useful - to detect loss for instance, only packets received after the loss can serve as pseudo-dupacks. There was a FIXME to change the error code when dccp_ackvec_add() fails. I removed this after finding out that: * the check whether ackno < ISN is already made earlier, * this Response is likely the 1st packet with an Ackno that the client gets, * so when dccp_ackvec_add() fails, the reason is likely not a packet error. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 3 --- net/dccp/dccp.h | 3 +-- net/dccp/diag.c | 2 +- net/dccp/input.c | 12 +++--------- net/dccp/minisocks.c | 1 - net/dccp/options.c | 7 ++----- net/dccp/proto.c | 3 +-- net/dccp/sysctl.c | 7 ------- 9 files changed, 8 insertions(+), 33 deletions(-) (limited to 'Documentation/networking') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 1403745ab40..7a3bb1abb83 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - tx_ccid = 2 Default CCID for the sender-receiver half-connection. Depending on the choice of CCID, the Send Ack Vector feature is enabled automatically. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 60e94438ead..61734e27abb 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 #define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 @@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_send_ack_vector; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e0759d098b9..0bc4c9a02e1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk) const struct dccp_sock *dp = dccp_sk(sk); return dp->dccps_timestamp_echo != 0 || #ifdef CONFIG_IP_DCCP_ACKVEC - (dccp_msk(sk)->dccpms_send_ack_vector && + (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || #endif inet_csk_ack_scheduled(sk); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d1e100395ef..652a1b67f72 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/input.c b/net/dccp/input.c index 0672b7e1763..5eb443f656c 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_ack_seq); } @@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); - if (dccp_msk(sk)->dccpms_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto out_invalid_packet; /* FIXME: change error code */ - /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); @@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 02b14812464..6821ae33dd3 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index e9d674874b4..7b1165c21f5 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) goto out_invalid_option; break; @@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; @@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_insert_option_timestamp(sk, skb)) return -1; - } else if (dmsk->dccpms_send_ack_vector && + } else if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && dccp_insert_option_ackvec(sk, skb)) { return -1; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0941f8fe167..d5c2bacb713 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -201,7 +201,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -219,7 +218,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dmsk->dccpms_send_ack_vector) { + if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 587c12f915c..018e210875e 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ackvec", - .data = &sysctl_dccp_feat_send_ack_vector, - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3-70-g09d2