From 65f5c7c1143fb8eed5bc7e7d8c926346e00fe3c0 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 20 Mar 2006 16:55:08 -0800 Subject: [IPV6]: ROUTE: Add accept_ra_defrtr sysctl. This controls whether we accept default router information in RAs. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/linux/sysctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sysctl.h') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bac61db2645..0f494137d03 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -531,6 +531,7 @@ enum { NET_IPV6_MAX_DESYNC_FACTOR=15, NET_IPV6_MAX_ADDRESSES=16, NET_IPV6_FORCE_MLD_VERSION=17, + NET_IPV6_ACCEPT_RA_DEFRTR=18, __NET_IPV6_MAX }; -- cgit v1.2.3-70-g09d2 From c4fd30eb18666972230689eb30e8f90844bce635 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 20 Mar 2006 16:55:26 -0800 Subject: [IPV6]: ADDRCONF: Add accept_ra_pinfo sysctl. This controls whether we accept Prefix Information in RAs. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 10 ++++++++-- include/linux/ipv6.h | 2 ++ include/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 11 +++++++++++ net/ipv6/ndisc.c | 2 +- 5 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux/sysctl.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8001faa76ea..404afacb468 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -723,6 +723,12 @@ accept_ra_defrtr - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_pinfo - BOOLEAN + Learn Prefix Information in Router Advertisement. + + Functional default: enabled if accept_ra is enabled. + disabled if accept_ra is disabled. + accept_redirects - BOOLEAN Accept Redirects.
@@ -733,8 +739,8 @@ autoconf - BOOLEAN Autoconfigure addresses using Prefix Information in Router Advertisements. - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. + Functional default: enabled if accept_ra_pinfo is enabled. + disabled if accept_ra_pinfo is disabled. dad_transmits - INTEGER The amount of Duplicate Address Detection probes to send. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c5131a02869..2c3b799480c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -146,6 +146,7 @@ struct ipv6_devconf { #endif __s32 max_addresses; __s32 accept_ra_defrtr; + __s32 accept_ra_pinfo; void *sysctl; }; @@ -169,6 +170,7 @@ enum { DEVCONF_MAX_ADDRESSES, DEVCONF_FORCE_MLD_VERSION, DEVCONF_ACCEPT_RA_DEFRTR, + DEVCONF_ACCEPT_RA_PINFO, DEVCONF_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 0f494137d03..09378ea505b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -532,6 +532,7 @@ enum { NET_IPV6_MAX_ADDRESSES=16, NET_IPV6_FORCE_MLD_VERSION=17, NET_IPV6_ACCEPT_RA_DEFRTR=18, + NET_IPV6_ACCEPT_RA_PINFO=19, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fbcdcc6ba93..631b51d0ccb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -166,6 +166,7 @@ struct ipv6_devconf ipv6_devconf = { #endif .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_pinfo = 1, }; static struct ipv6_devconf ipv6_devconf_dflt = { @@ -188,6 +189,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = { #endif .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, + .accept_ra_pinfo = 1, }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ @@ -3119,6 +3121,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, #endif array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; } static int 
inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, @@ -3579,6 +3582,14 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_IPV6_ACCEPT_RA_PINFO, + .procname = "accept_ra_pinfo", + .data = &ipv6_devconf.accept_ra_pinfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0, /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e1711679605..3b56be85234 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1191,7 +1191,7 @@ skip_defrtr: NEIGH_UPDATE_F_ISROUTER); } - if (ndopts.nd_opts_pi) { + if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; -- cgit v1.2.3-70-g09d2 From 930d6ff2e2a5f1538448d3b0b2652a8f0c0f6cba Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 20 Mar 2006 17:05:30 -0800 Subject: [IPV6]: ROUTE: Add accept_ra_rtr_pref sysctl. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/linux/ipv6.h | 4 ++++ include/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 19 +++++++++++++++++++ net/ipv6/ndisc.c | 3 ++- 5 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux/sysctl.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 404afacb468..87bbd774c2b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -729,6 +729,12 @@ accept_ra_pinfo - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_rtr_pref - BOOLEAN + Accept Router Preference in RA. + + Functional default: enabled if accept_ra is enabled. + disabled if accept_ra is disabled. + accept_redirects - BOOLEAN Accept Redirects. 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 2c3b799480c..108b75dccd9 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -147,6 +147,9 @@ struct ipv6_devconf { __s32 max_addresses; __s32 accept_ra_defrtr; __s32 accept_ra_pinfo; +#ifdef CONFIG_IPV6_ROUTER_PREF + __s32 accept_ra_rtr_pref; +#endif void *sysctl; }; @@ -171,6 +174,7 @@ enum { DEVCONF_FORCE_MLD_VERSION, DEVCONF_ACCEPT_RA_DEFRTR, DEVCONF_ACCEPT_RA_PINFO, + DEVCONF_ACCEPT_RA_RTR_PREF, DEVCONF_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 09378ea505b..236f537b38d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -533,6 +533,7 @@ enum { NET_IPV6_FORCE_MLD_VERSION=17, NET_IPV6_ACCEPT_RA_DEFRTR=18, NET_IPV6_ACCEPT_RA_PINFO=19, + NET_IPV6_ACCEPT_RA_RTR_PREF=20, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 631b51d0ccb..51edba5fea2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -167,6 +167,9 @@ struct ipv6_devconf ipv6_devconf = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_pinfo = 1, +#ifdef CONFIG_IPV6_ROUTER_PREF + .accept_ra_rtr_pref = 1, +#endif }; static struct ipv6_devconf ipv6_devconf_dflt = { @@ -190,6 +193,9 @@ static struct ipv6_devconf ipv6_devconf_dflt = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_pinfo = 1, +#ifdef CONFIG_IPV6_ROUTER_PREF + .accept_ra_rtr_pref = 1, +#endif }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ @@ -3122,6 +3128,9 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; +#ifdef CONFIG_IPV6_ROUTER_PREF + array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; +#endif } static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, @@ -3590,6 +3599,16 @@ static struct addrconf_sysctl_table 
.mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_IPV6_ROUTER_PREF + { + .ctl_name = NET_IPV6_ACCEPT_RA_RTR_PREF, + .procname = "accept_ra_rtr_pref", + .data = &ipv6_devconf.accept_ra_rtr_pref, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0, /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 966ab6b3022..f4462ee3302 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1090,7 +1090,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ - if (pref == ICMPV6_ROUTER_PREF_INVALID) + if (pref == ICMPV6_ROUTER_PREF_INVALID || + !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif -- cgit v1.2.3-70-g09d2 From 52e1635631b342803aecaf81a362c1464e3da2e5 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 20 Mar 2006 17:05:47 -0800 Subject: [IPV6]: ROUTE: Add router_probe_interval sysctl. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/linux/ipv6.h | 2 ++ include/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 12 ++++++++++++ net/ipv6/route.c | 2 +- 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux/sysctl.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 87bbd774c2b..88efed0a533 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -789,6 +789,12 @@ mtu - INTEGER Default Maximum Transfer Unit Default: 1280 (IPv6 required minimum) +router_probe_interval - INTEGER + Minimum interval (in seconds) between Router Probing described + in RFC4191. + + Default: 60 + router_solicitation_delay - INTEGER Number of seconds to wait after interface is brought up before sending Router Solicitations.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 108b75dccd9..c609cc70237 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -149,6 +149,7 @@ struct ipv6_devconf { __s32 accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; + __s32 rtr_probe_interval; #endif void *sysctl; }; @@ -175,6 +176,7 @@ enum { DEVCONF_ACCEPT_RA_DEFRTR, DEVCONF_ACCEPT_RA_PINFO, DEVCONF_ACCEPT_RA_RTR_PREF, + DEVCONF_RTR_PROBE_INTERVAL, DEVCONF_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 236f537b38d..f49488ffefe 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -534,6 +534,7 @@ enum { NET_IPV6_ACCEPT_RA_DEFRTR=18, NET_IPV6_ACCEPT_RA_PINFO=19, NET_IPV6_ACCEPT_RA_RTR_PREF=20, + NET_IPV6_RTR_PROBE_INTERVAL=21, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 51edba5fea2..e7add61e6e3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -169,6 +169,7 @@ struct ipv6_devconf ipv6_devconf = { .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, + .rtr_probe_interval = 60 * HZ, #endif }; @@ -195,6 +196,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = { .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, + .rtr_probe_interval = 60 * HZ, #endif }; @@ -3130,6 +3132,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; + array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval; #endif } @@ -3608,6 +3611,15 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_IPV6_RTR_PROBE_INTERVAL, + .procname = "router_probe_interval", + .data = &ipv6_devconf.rtr_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, #endif { .ctl_name = 0, 
/* sentinel */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8ba8900c0a5..c797b9bbb7d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -234,7 +234,7 @@ static void rt6_probe(struct rt6_info *rt) return; read_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && - time_after(jiffies, neigh->updated + 60 * HZ)) { + time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { struct in6_addr mcaddr; struct in6_addr *target; -- cgit v1.2.3-70-g09d2 From 09c884d4c3b45cda904c2291d4723074ff523611 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 20 Mar 2006 17:07:03 -0800 Subject: [IPV6]: ROUTE: Add accept_ra_rt_info_max_plen sysctl. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 4 ++++ include/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 19 +++++++++++++++++++ net/ipv6/ndisc.c | 4 +++- 5 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/linux/sysctl.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 88efed0a533..35aed1c6dd9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -729,6 +729,15 @@ accept_ra_pinfo - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_rt_info_max_plen - INTEGER + Maximum prefix length of Route Information in RA. + + Route Information w/ prefix larger than this + variable shall be ignored. + + Functional default: 0 if accept_ra_rtr_pref is enabled. + -1 if accept_ra_rtr_pref is disabled. + accept_ra_rtr_pref - BOOLEAN Accept Router Preference in RA.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c609cc70237..1263d8cb3c1 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -150,6 +150,9 @@ struct ipv6_devconf { #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; +#ifdef CONFIG_IPV6_ROUTE_INFO + __s32 accept_ra_rt_info_max_plen; +#endif #endif void *sysctl; }; @@ -177,6 +180,7 @@ enum { DEVCONF_ACCEPT_RA_PINFO, DEVCONF_ACCEPT_RA_RTR_PREF, DEVCONF_RTR_PROBE_INTERVAL, + DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN, DEVCONF_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f49488ffefe..8ad4beab288 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -535,6 +535,7 @@ enum { NET_IPV6_ACCEPT_RA_PINFO=19, NET_IPV6_ACCEPT_RA_RTR_PREF=20, NET_IPV6_RTR_PROBE_INTERVAL=21, + NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e7add61e6e3..eb82cd5df8c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -170,6 +170,9 @@ struct ipv6_devconf ipv6_devconf = { #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, +#ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_max_plen = 0, +#endif #endif }; @@ -197,6 +200,9 @@ static struct ipv6_devconf ipv6_devconf_dflt = { #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, +#ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_max_plen = 0, +#endif #endif }; @@ -3133,6 +3139,9 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval; +#ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; +#endif #endif } @@ -3620,6 +3629,16 @@ static struct addrconf_sysctl_table .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies, }, +#ifdef
CONFIG_IPV6_ROUTE_INFO + { + .ctl_name = NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, + .procname = "accept_ra_rt_info_max_plen", + .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #endif { .ctl_name = 0, /* sentinel */ diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1f625690967..dfa20d3be9b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1214,11 +1214,13 @@ skip_defrtr: } #ifdef CONFIG_IPV6_ROUTE_INFO - if (ndopts.nd_opts_ri) { + if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { + if (((struct route_info *)p)->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) + continue; rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, &skb->nh.ipv6h->saddr); } -- cgit v1.2.3-70-g09d2 From 5d424d5a674f782d0659a3b66d951f412901faee Mon Sep 17 00:00:00 2001 From: John Heffner Date: Mon, 20 Mar 2006 17:53:41 -0800 Subject: [TCP]: MTU probing Implementation of packetization layer path mtu discovery for TCP, based on the internet-draft currently found at . Signed-off-by: John Heffner Signed-off-by: David S.
Miller --- include/linux/sysctl.h | 2 + include/net/inet_connection_sock.h | 13 ++ include/net/tcp.h | 9 ++ net/ipv4/sysctl_net_ipv4.c | 16 +++ net/ipv4/tcp_input.c | 49 ++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 236 ++++++++++++++++++++++++++++++++++--- net/ipv4/tcp_timer.c | 36 +++--- net/ipv6/tcp_ipv6.c | 1 + 9 files changed, 326 insertions(+), 37 deletions(-) (limited to 'include/linux/sysctl.h') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 8ad4beab288..6e8880ea49e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -397,6 +397,8 @@ enum NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, NET_IPV4_IPFRAG_MAX_DIST=112, + NET_TCP_MTU_PROBING=113, + NET_TCP_BASE_MSS=114, }; enum { diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index fa587c94e9d..b3abe33f4e5 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops { * @icsk_probes_out: unanswered 0 window probes * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data + * @icsk_mtup: MTU probing control data */ struct inet_connection_sock { /* inet_sock has to be the first member! */ @@ -104,6 +105,18 @@ struct inet_connection_sock { __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; + struct { + int enabled; + + /* Range of MTUs to search */ + int search_high; + int search_low; + + /* Information on the current probe. */ + int probe_size; + __u32 probe_seq_start; + __u32 probe_seq_end; + } icsk_mtup; u32 icsk_ca_priv[16]; #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) }; diff --git a/include/net/tcp.h b/include/net/tcp.h index 77f21c65bbc..16879fa560d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); /* Minimal RCV_MSS.
*/ #define TCP_MIN_RCVMSS 536U +/* The least MTU to use for probing */ +#define TCP_BASE_MSS 512 + /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 @@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_abc; +extern int sysctl_tcp_mtu_probing; +extern int sysctl_tcp_base_mss; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, extern void tcp_initialize_rcv_mss(struct sock *sk); +extern int tcp_mtu_to_mss(struct sock *sk, int pmtu); +extern int tcp_mss_to_mtu(struct sock *sk, int mss); +extern void tcp_mtup_init(struct sock *sk); + static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 16984d4a8a0..ebf2e0b363c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -664,6 +664,22 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_TCP_MTU_PROBING, + .procname = "tcp_mtu_probing", + .data = &sysctl_tcp_mtu_probing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BASE_MSS, + .procname = "tcp_base_mss", + .data = &sysctl_tcp_base_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e9a54ae7d69..0ac388e3d01 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) } } +static void tcp_mtup_probe_failed(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_mtup.search_high = 
icsk->icsk_mtup.probe_size - 1; + icsk->icsk_mtup.probe_size = 0; +} + +static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + /* FIXME: breaks with very large cwnd */ + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_cwnd = tp->snd_cwnd * + tcp_mss_to_mtu(sk, tp->mss_cache) / + icsk->icsk_mtup.probe_size; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_ssthresh = tcp_current_ssthresh(sk); + + icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; + icsk->icsk_mtup.probe_size = 0; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); +} + + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, return; } + /* MTU probe failure: don't reduce cwnd */ + if (icsk->icsk_ca_state < TCP_CA_CWR && + icsk->icsk_mtup.probe_size && + tp->snd_una == icsk->icsk_mtup.probe_seq_start) { + tcp_mtup_probe_failed(sk); + /* Restores the reduction we did in tcp_mtu_probe() */ + tp->snd_cwnd++; + tcp_simple_retransmit(sk); + return; + } + /* Otherwise enter Recovery state */ if (IsReno(tp)) @@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tp->retrans_stamp = 0; } + /* MTU probing checks */ + if (icsk->icsk_mtup.probe_size) { + if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { + tcp_mtup_probe_success(sk, skb); + } + } + if (sacked) { if (sacked & TCPCB_RETRANS) { if(sacked & TCPCB_SACKED_RETRANS) @@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; + tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@
-4211,6 +4258,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); + tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ tp->lsndtime = tcp_time_stamp; + tcp_mtup_init(sk); tcp_initialize_rcv_mss(sk); tcp_init_buffer_space(sk); tcp_fast_path_on(tp); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 233bdf25996..57e7a26e821 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; + tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9f498a6c889..8197b5e12f1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; +int sysctl_tcp_mtu_probing = 0; +int sysctl_tcp_base_mss = 512; + +EXPORT_SYMBOL(sysctl_tcp_mtu_probing); +EXPORT_SYMBOL(sysctl_tcp_base_mss); + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } +/* Not accounting for SACKs here. 
*/ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= icsk->icsk_ext_hdr_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + return mss_now; +} + +/* Inverse of above */ +int tcp_mss_to_mtu(struct sock *sk, int mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mtu; + + mtu = mss + + tp->tcp_header_len + + icsk->icsk_ext_hdr_len + + icsk->icsk_af_ops->net_header_len; + + return mtu; +} + +void tcp_mtup_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); + icsk->icsk_mtup.probe_size = 0; +} + /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. 
It does NOT counts @@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - /* Calculate base mss without TCP options: - It is MMS_S - sizeof(tcphdr) of rfc1122 - */ - int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - - sizeof(struct tcphdr)); + int mss_now; - /* Clamp it (mss_clamp does not include tcp options) */ - if (mss_now > tp->rx_opt.mss_clamp) - mss_now = tp->rx_opt.mss_clamp; + if (icsk->icsk_mtup.search_high > pmtu) + icsk->icsk_mtup.search_high = pmtu; - /* Now subtract optional transport overhead */ - mss_now -= icsk->icsk_ext_hdr_len; - - /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + mss_now = tcp_mtu_to_mss(sk, pmtu); /* Bound mss with half of window */ if (tp->max_window && mss_now > (tp->max_window>>1)) @@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; + if (icsk->icsk_mtup.enabled) + mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; @@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ return 1; } +/* Create a new MTU probe if we are ready. 
+ * Returns 0 if we should wait to probe (no cwnd available), + * 1 if a probe was sent, + * -1 otherwise */ +static int tcp_mtu_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *nskb, *next; + int len; + int probe_size; + unsigned int pif; + int copy; + int mss_now; + + /* Not currently probing/verifying, + * not in recovery, + * have enough cwnd, and + * not SACKing (the variable headers throw things off) */ + if (!icsk->icsk_mtup.enabled || + icsk->icsk_mtup.probe_size || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || + tp->snd_cwnd < 11 || + tp->rx_opt.eff_sacks) + return -1; + + /* Very simple search strategy: just double the MSS. */ + mss_now = tcp_current_mss(sk, 0); + probe_size = 2*tp->mss_cache; + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { + /* TODO: set timer for probe_converge_event */ + return -1; + } + + /* Have enough data in the send queue to probe? */ + len = 0; + if ((skb = sk->sk_send_head) == NULL) + return -1; + while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) + skb = skb->next; + if (len < probe_size) + return -1; + + /* Receive window check. */ + if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) { + if (tp->snd_wnd < probe_size) + return -1; + else + return 0; + } + + /* Do we need to wait to drain cwnd? */ + pif = tcp_packets_in_flight(tp); + if (pif + 2 > tp->snd_cwnd) { + /* With no packets in flight, don't stall. */ + if (pif == 0) + return -1; + else + return 0; + } + + /* We're allowed to probe. Build it now. 
*/ + if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) + return -1; + sk_charge_skb(sk, nskb); + + skb = sk->sk_send_head; + __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); + sk->sk_send_head = nskb; + + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; + TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(nskb)->sacked = 0; + nskb->csum = 0; + if (skb->ip_summed == CHECKSUM_HW) + nskb->ip_summed = CHECKSUM_HW; + + len = 0; + while (len < probe_size) { + next = skb->next; + + copy = min_t(int, skb->len, probe_size - len); + if (nskb->ip_summed) + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); + else + nskb->csum = skb_copy_and_csum_bits(skb, 0, + skb_put(nskb, copy), copy, nskb->csum); + + if (skb->len <= copy) { + /* We've eaten all the data from this skb. + * Throw it away. */ + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; + __skb_unlink(skb, &sk->sk_write_queue); + sk_stream_free_skb(sk, skb); + } else { + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & + ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + if (!skb_shinfo(skb)->nr_frags) { + skb_pull(skb, copy); + if (skb->ip_summed != CHECKSUM_HW) + skb->csum = csum_partial(skb->data, skb->len, 0); + } else { + __pskb_trim_head(skb, copy); + tcp_set_skb_tso_segs(sk, skb, mss_now); + } + TCP_SKB_CB(skb)->seq += copy; + } + + len += copy; + skb = next; + } + tcp_init_tso_segs(sk, nskb, nskb->len); + + /* We're ready to send. If this fails, the probe will + * be resegmented into mss-sized pieces by tcp_write_xmit(). */ + TCP_SKB_CB(nskb)->when = tcp_time_stamp; + if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { + /* Decrement cwnd here because we are sending + * effectively two packets. 
*/ + tp->snd_cwnd--; + update_send_head(sk, tp, nskb); + + icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); + icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq; + icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; + + return 1; + } + + return -1; +} + + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; + int result; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all @@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) return 0; sent_pkts = 0; + + /* Do MTU probing. */ + if ((result = tcp_mtu_probe(sk)) == 0) { + return 0; + } else if (result > 0) { + sent_pkts = 1; + } + while ((skb = sk->sk_send_head)) { unsigned int limit; @@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss = tcp_current_mss(sk, 0); int err; + /* Inconclusive MTU probe */ + if (icsk->icsk_mtup.probe_size) { + icsk->icsk_mtup.probe_size = 0; + } + /* Do not sent more than we queued. 1/4 is reserved for possible * copying overhead: fragmentation, tunneling, mangling etc.
*/ @@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk) if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; + tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); if (!tp->window_clamp) @@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); +EXPORT_SYMBOL(tcp_mtup_init); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e1880959614..7c1bde3cd6c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive) /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int retry_until; + int mss; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) @@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk) retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black - hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disgusting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. 
It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ + /* Black hole detection */ + if (sysctl_tcp_mtu_probing) { + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + mss = min(sysctl_tcp_base_mss, + tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2); + mss = max(mss, 68 - tp->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + } dst_negative_advice(&sk->sk_dst_cache); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ca9cf685375..14de50380f4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + newnp->opt->opt_flen); + tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); -- cgit v1.2.3-70-g09d2 From f8cd54884e675dfaf0c86cc7c088adb6ca9d7638 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 20 Mar 2006 19:15:11 -0800 Subject: [IPSEC]: Sync series - core changes This patch provides the core functionality needed for sync events for ipsec. Derived work of Krisztian KOVACS Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 2 ++ include/linux/xfrm.h | 30 ++++++++++++++++++ include/net/xfrm.h | 44 ++++++++++++++++++++++++++- net/core/sysctl_net_core.c | 23 ++++++++++++++ net/xfrm/xfrm_state.c | 76 +++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_user.c | 4 ++- 6 files changed, 176 insertions(+), 3 deletions(-) (limited to 'include/linux/sysctl.h') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6e8880ea49e..b686548f32e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -261,6 +261,8 @@ enum NET_CORE_DEV_WEIGHT=17, NET_CORE_SOMAXCONN=18, NET_CORE_BUDGET=19, + NET_CORE_AEVENT_ETIME=20, + NET_CORE_AEVENT_RSEQTH=21, }; /* /proc/sys/net/ethernet */ diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 82fbb758e28..b54a12940ef 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -156,6 +156,10 @@ enum { XFRM_MSG_FLUSHPOLICY, #define XFRM_MSG_FLUSHPOLICY XFRM_MSG_FLUSHPOLICY + XFRM_MSG_NEWAE, +#define XFRM_MSG_NEWAE XFRM_MSG_NEWAE + XFRM_MSG_GETAE, +#define XFRM_MSG_GETAE XFRM_MSG_GETAE __XFRM_MSG_MAX }; #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1) @@ -194,6 +198,21 @@ struct xfrm_encap_tmpl { xfrm_address_t encap_oa; }; +/* AEVENT flags */ +enum xfrm_ae_ftype_t { + XFRM_AE_UNSPEC, + XFRM_AE_RTHR=1, /* replay threshold*/ + XFRM_AE_RVAL=2, /* replay value */ + XFRM_AE_LVAL=4, /* lifetime value */ + XFRM_AE_ETHR=8, /* expiry timer threshold */ + XFRM_AE_CR=16, /* Event cause is replay update */ + XFRM_AE_CE=32, /* Event cause is timer expiry */ + XFRM_AE_CU=64, /* Event cause is policy update */ + __XFRM_AE_MAX + +#define XFRM_AE_MAX (__XFRM_AE_MAX - 1) +}; + /* Netlink message attributes. 
*/ enum xfrm_attr_type_t { XFRMA_UNSPEC, @@ -205,6 +224,10 @@ enum xfrm_attr_type_t { XFRMA_SA, XFRMA_POLICY, XFRMA_SEC_CTX, /* struct xfrm_sec_ctx */ + XFRMA_LTIME_VAL, + XFRMA_REPLAY_VAL, + XFRMA_REPLAY_THRESH, + XFRMA_ETIMER_THRESH, __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) @@ -235,6 +258,11 @@ struct xfrm_usersa_id { __u8 proto; }; +struct xfrm_aevent_id { + __u32 flags; + struct xfrm_usersa_id sa_id; +}; + struct xfrm_userspi_info { struct xfrm_usersa_info info; __u32 min; @@ -306,6 +334,8 @@ enum xfrm_nlgroups { #define XFRMNLGRP_SA XFRMNLGRP_SA XFRMNLGRP_POLICY, #define XFRMNLGRP_POLICY XFRMNLGRP_POLICY + XFRMNLGRP_AEVENTS, +#define XFRMNLGRP_AEVENTS XFRMNLGRP_AEVENTS __XFRMNLGRP_MAX }; #define XFRMNLGRP_MAX (__XFRMNLGRP_MAX - 1) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8d362c49b8a..bc005e62e43 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,6 +20,10 @@ #define XFRM_ALIGN8(len) (((len) + 7) & ~7) +extern struct sock *xfrm_nl; +extern u32 sysctl_xfrm_aevent_etime; +extern u32 sysctl_xfrm_aevent_rseqth; + extern struct semaphore xfrm_cfg_sem; /* Organization of SPD aka "XFRM rules" @@ -135,6 +139,16 @@ struct xfrm_state /* State for replay detection */ struct xfrm_replay_state replay; + /* Replay detection state at the time we sent the last notification */ + struct xfrm_replay_state preplay; + + /* Replay detection notification settings */ + u32 replay_maxage; + u32 replay_maxdiff; + + /* Replay detection notification timer */ + struct timer_list rtimer; + /* Statistics */ struct xfrm_stats stats; @@ -169,6 +183,7 @@ struct km_event u32 hard; u32 proto; u32 byid; + u32 aevent; } data; u32 seq; @@ -305,7 +320,21 @@ struct xfrm_policy struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; }; -#define XFRM_KM_TIMEOUT 30 +#define XFRM_KM_TIMEOUT 30 +/* which seqno */ +#define XFRM_REPLAY_SEQ 1 +#define XFRM_REPLAY_OSEQ 2 +#define XFRM_REPLAY_SEQ_MASK 3 +/* what happened */ +#define XFRM_REPLAY_UPDATE XFRM_AE_CR +#define 
XFRM_REPLAY_TIMEOUT XFRM_AE_CE + +/* default aevent timeout in units of 100ms */ +#define XFRM_AE_ETIME 10 +/* Async Event timer multiplier */ +#define XFRM_AE_ETH_M 10 +/* default seq threshold size */ +#define XFRM_AE_SEQT_SIZE 2 struct xfrm_mgr { @@ -865,6 +894,7 @@ extern int xfrm_state_delete(struct xfrm_state *x); extern void xfrm_state_flush(u8 proto); extern int xfrm_replay_check(struct xfrm_state *x, u32 seq); extern void xfrm_replay_advance(struct xfrm_state *x, u32 seq); +extern void xfrm_replay_notify(struct xfrm_state *x, int event); extern int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm_state_mtu(struct xfrm_state *x, int mtu); extern int xfrm_init_state(struct xfrm_state *x); @@ -965,4 +995,16 @@ static inline int xfrm_policy_id2dir(u32 index) return index & 7; } +static inline int xfrm_aevent_is_on(void) +{ + return netlink_has_listeners(xfrm_nl,XFRMNLGRP_AEVENTS); +} + +static inline void xfrm_aevent_doreplay(struct xfrm_state *x) +{ + if (xfrm_aevent_is_on()) + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); +} + + #endif /* _NET_XFRM_H */ diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 2f278c8e474..71045365672 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,11 @@ extern int sysctl_core_destroy_delay; extern char sysctl_divert_version[]; #endif /* CONFIG_NET_DIVERT */ +#ifdef CONFIG_XFRM +extern u32 sysctl_xfrm_aevent_etime; +extern u32 sysctl_xfrm_aevent_rseqth; +#endif + ctl_table core_table[] = { #ifdef CONFIG_NET { @@ -111,6 +116,24 @@ ctl_table core_table[] = { .proc_handler = &proc_dostring }, #endif /* CONFIG_NET_DIVERT */ +#ifdef CONFIG_XFRM + { + .ctl_name = NET_CORE_AEVENT_ETIME, + .procname = "xfrm_aevent_etime", + .data = &sysctl_xfrm_aevent_etime, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_AEVENT_RSEQTH, + .procname = "xfrm_aevent_rseqth", + .data = &sysctl_xfrm_aevent_rseqth, + 
.maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif /* CONFIG_XFRM */ #endif /* CONFIG_NET */ { .ctl_name = NET_CORE_SOMAXCONN, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c656cbaf35e..8eaee499cad 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -20,6 +20,8 @@ #include #include +u32 sysctl_xfrm_aevent_etime = XFRM_AE_ETIME; +u32 sysctl_xfrm_aevent_rseqth = XFRM_AE_SEQT_SIZE; /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) @@ -62,6 +64,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (del_timer(&x->timer)) BUG(); + if (del_timer(&x->rtimer)) + BUG(); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); @@ -190,11 +194,16 @@ struct xfrm_state *xfrm_state_alloc(void) init_timer(&x->timer); x->timer.function = xfrm_timer_handler; x->timer.data = (unsigned long)x; + init_timer(&x->rtimer); + x->rtimer.function = xfrm_replay_timer_handler; + x->rtimer.data = (unsigned long)x; x->curlft.add_time = (unsigned long)xtime.tv_sec; x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; x->lft.hard_byte_limit = XFRM_INF; x->lft.hard_packet_limit = XFRM_INF; + x->replay_maxage = 0; + x->replay_maxdiff = 0; spin_lock_init(&x->lock); } return x; @@ -228,6 +237,8 @@ static int __xfrm_state_delete(struct xfrm_state *x) spin_unlock(&xfrm_state_lock); if (del_timer(&x->timer)) __xfrm_state_put(x); + if (del_timer(&x->rtimer)) + __xfrm_state_put(x); /* The number two in this test is the reference * mentioned in the comment below plus the reference @@ -426,6 +437,10 @@ static void __xfrm_state_insert(struct xfrm_state *x) if (!mod_timer(&x->timer, jiffies + HZ)) xfrm_state_hold(x); + if (x->replay_maxage && + !mod_timer(&x->rtimer, jiffies + x->replay_maxage)) + xfrm_state_hold(x); + wake_up(&km_waitq); } @@ -762,6 +777,62 @@ out: } EXPORT_SYMBOL(xfrm_state_walk); + +void xfrm_replay_notify(struct xfrm_state *x, int event) 
+{ + struct km_event c; + /* we send notify messages in case + * 1. we updated on of the sequence numbers, and the seqno difference + * is at least x->replay_maxdiff, in this case we also update the + * timeout of our timer function + * 2. if x->replay_maxage has elapsed since last update, + * and there were changes + * + * The state structure must be locked! + */ + + switch (event) { + case XFRM_REPLAY_UPDATE: + if (x->replay_maxdiff && + (x->replay.seq - x->preplay.seq < x->replay_maxdiff) && + (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff)) + return; + + break; + + case XFRM_REPLAY_TIMEOUT: + if ((x->replay.seq == x->preplay.seq) && + (x->replay.bitmap == x->preplay.bitmap) && + (x->replay.oseq == x->preplay.oseq)) + return; + + break; + } + + memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state)); + c.event = XFRM_MSG_NEWAE; + c.data.aevent = event; + km_state_notify(x, &c); + +resched: + if (x->replay_maxage && + !mod_timer(&x->rtimer, jiffies + x->replay_maxage)) + xfrm_state_hold(x); + +} + +static void xfrm_replay_timer_handler(unsigned long data) +{ + struct xfrm_state *x = (struct xfrm_state*)data; + + spin_lock(&x->lock); + + if (xfrm_aevent_is_on() && x->km.state == XFRM_STATE_VALID) + xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT); + + spin_unlock(&x->lock); +} + int xfrm_replay_check(struct xfrm_state *x, u32 seq) { u32 diff; @@ -805,6 +876,9 @@ void xfrm_replay_advance(struct xfrm_state *x, u32 seq) diff = x->replay.seq - seq; x->replay.bitmap |= (1U << diff); } + + if (xfrm_aevent_is_on()) + xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); } EXPORT_SYMBOL(xfrm_replay_advance); @@ -835,7 +909,7 @@ void km_state_notify(struct xfrm_state *x, struct km_event *c) EXPORT_SYMBOL(km_policy_notify); EXPORT_SYMBOL(km_state_notify); -static void km_state_expired(struct xfrm_state *x, int hard) +void km_state_expired(struct xfrm_state *x, int hard) { struct km_event c; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 
7de17559249..6f643e58e04 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -28,7 +28,7 @@ #include #include -static struct sock *xfrm_nl; +struct sock *xfrm_nl; static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type) { @@ -1618,3 +1618,5 @@ module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); +EXPORT_SYMBOL(xfrm_nl); + -- cgit v1.2.3-70-g09d2 From e55d912f5b75723159348a7fc7692f869a86636a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 20 Mar 2006 19:25:02 -0800 Subject: [DCCP] feat: Introduce sysctls for the default features [root@qemu ~]# for a in /proc/sys/net/dccp/default/* ; do echo $a ; cat $a ; done /proc/sys/net/dccp/default/ack_ratio 2 /proc/sys/net/dccp/default/rx_ccid 3 /proc/sys/net/dccp/default/send_ackvec 1 /proc/sys/net/dccp/default/send_ndp 1 /proc/sys/net/dccp/default/seq_window 100 /proc/sys/net/dccp/default/tx_ccid 3 [root@qemu ~]# So if wanting to test ccid3 as the tx CCID one can just do: [root@qemu ~]# echo 3 > /proc/sys/net/dccp/default/tx_ccid [root@qemu ~]# echo 2 > /proc/sys/net/dccp/default/rx_ccid [root@qemu ~]# cat /proc/sys/net/dccp/default/[tr]x_ccid 2 3 [root@qemu ~]# Of course we also need the setsockopt for each app to tell its preferences, but for testing or defining something other than CCID2 as the default for apps that don't explicitly set their preference the sysctl interface is handy. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 16 +++++++ net/dccp/Makefile | 2 + net/dccp/dccp.h | 14 ++++++ net/dccp/options.c | 22 +++++---- net/dccp/proto.c | 9 +++- net/dccp/sysctl.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 176 insertions(+), 11 deletions(-) create mode 100644 net/dccp/sysctl.c (limited to 'include/linux/sysctl.h') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b686548f32e..dfcf449afc7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -211,6 +211,7 @@ enum NET_SCTP=17, NET_LLC=18, NET_NETFILTER=19, + NET_DCCP=20, }; /* /proc/sys/kernel/random */ @@ -571,6 +572,21 @@ enum { __NET_NEIGH_MAX }; +/* /proc/sys/net/dccp */ +enum { + NET_DCCP_DEFAULT=1, +}; + +/* /proc/sys/net/dccp/default */ +enum { + NET_DCCP_DEFAULT_SEQ_WINDOW = 1, + NET_DCCP_DEFAULT_RX_CCID = 2, + NET_DCCP_DEFAULT_TX_CCID = 3, + NET_DCCP_DEFAULT_ACK_RATIO = 4, + NET_DCCP_DEFAULT_SEND_ACKVEC = 5, + NET_DCCP_DEFAULT_SEND_NDP = 6, +}; + /* /proc/sys/net/ipx */ enum { NET_IPX_PPROP_BROADCASTING=1, diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 5736acea1c8..7af0569fe4c 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -11,6 +11,8 @@ dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o +dccp-$(CONFIG_SYSCTL) += sysctl.o + dccp_diag-y := diag.o obj-y += ccids/ diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1764adb4f15..f059541f5a1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -433,4 +433,18 @@ static inline void timeval_sub_usecs(struct timeval *tv, } } +#ifdef CONFIG_SYSCTL +extern int dccp_sysctl_init(void); +extern void dccp_sysctl_exit(void); +#else +static inline int dccp_sysctl_init(void) +{ + return 0; +} + +static inline void dccp_sysctl_exit(void) +{ +} +#endif + #endif /* _DCCP_H */ diff --git a/net/dccp/options.c b/net/dccp/options.c index 7d73b33a604..3ecd319c0f5 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,19 +23,21 @@ #include 
"dccp.h" #include "feat.h" -/* stores the default values for new connection. may be changed with sysctl */ -static const struct dccp_options dccpo_default_values = { - .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW, - .dccpo_rx_ccid = DCCPF_INITIAL_CCID, - .dccpo_tx_ccid = DCCPF_INITIAL_CCID, - .dccpo_ack_ratio = DCCPF_INITIAL_ACK_RATIO, - .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR, - .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT, -}; +int dccp_feat_default_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; +int dccp_feat_default_rx_ccid = DCCPF_INITIAL_CCID; +int dccp_feat_default_tx_ccid = DCCPF_INITIAL_CCID; +int dccp_feat_default_ack_ratio = DCCPF_INITIAL_ACK_RATIO; +int dccp_feat_default_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; +int dccp_feat_default_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; void dccp_options_init(struct dccp_options *dccpo) { - memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo)); + dccpo->dccpo_sequence_window = dccp_feat_default_sequence_window; + dccpo->dccpo_rx_ccid = dccp_feat_default_rx_ccid; + dccpo->dccpo_tx_ccid = dccp_feat_default_tx_ccid; + dccpo->dccpo_ack_ratio = dccp_feat_default_ack_ratio; + dccpo->dccpo_send_ack_vector = dccp_feat_default_send_ack_vector; + dccpo->dccpo_send_ndp_count = dccp_feat_default_send_ndp_count; } static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 53735ee2bbd..6403e9306dd 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -934,11 +934,17 @@ static int __init dccp_init(void) if (rc) goto out_unregister_protosw; - rc = dccp_ctl_sock_init(); + rc = dccp_sysctl_init(); if (rc) goto out_ackvec_exit; + + rc = dccp_ctl_sock_init(); + if (rc) + goto out_sysctl_exit; out: return rc; +out_sysctl_exit: + dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_unregister_protosw: @@ -983,6 +989,7 @@ static void __exit dccp_fini(void) kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 
proto_unregister(&dccp_prot); dccp_ackvec_exit(); + dccp_sysctl_exit(); } module_init(dccp_init); diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c new file mode 100644 index 00000000000..64c89e9c229 --- /dev/null +++ b/net/dccp/sysctl.c @@ -0,0 +1,124 @@ +/* + * net/dccp/sysctl.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. + */ + +#include +#include +#include + +#ifndef CONFIG_SYSCTL +#error This file should not be compiled without CONFIG_SYSCTL defined +#endif + +extern int dccp_feat_default_sequence_window; +extern int dccp_feat_default_rx_ccid; +extern int dccp_feat_default_tx_ccid; +extern int dccp_feat_default_ack_ratio; +extern int dccp_feat_default_send_ack_vector; +extern int dccp_feat_default_send_ndp_count; + +static struct ctl_table dccp_default_table[] = { + { + .ctl_name = NET_DCCP_DEFAULT_SEQ_WINDOW, + .procname = "seq_window", + .data = &dccp_feat_default_sequence_window, + .maxlen = sizeof(dccp_feat_default_sequence_window), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_DCCP_DEFAULT_RX_CCID, + .procname = "rx_ccid", + .data = &dccp_feat_default_rx_ccid, + .maxlen = sizeof(dccp_feat_default_rx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_DCCP_DEFAULT_TX_CCID, + .procname = "tx_ccid", + .data = &dccp_feat_default_tx_ccid, + .maxlen = sizeof(dccp_feat_default_tx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_DCCP_DEFAULT_ACK_RATIO, + .procname = "ack_ratio", + .data = &dccp_feat_default_ack_ratio, + .maxlen = sizeof(dccp_feat_default_ack_ratio), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_DCCP_DEFAULT_SEND_ACKVEC, + .procname = "send_ackvec", + .data = &dccp_feat_default_send_ack_vector, + .maxlen = 
sizeof(dccp_feat_default_send_ack_vector), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_DCCP_DEFAULT_SEND_NDP, + .procname = "send_ndp", + .data = &dccp_feat_default_send_ndp_count, + .maxlen = sizeof(dccp_feat_default_send_ndp_count), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .ctl_name = 0, } +}; + +static struct ctl_table dccp_table[] = { + { + .ctl_name = NET_DCCP_DEFAULT, + .procname = "default", + .mode = 0555, + .child = dccp_default_table, + }, + { .ctl_name = 0, }, +}; + +static struct ctl_table dccp_dir_table[] = { + { + .ctl_name = NET_DCCP, + .procname = "dccp", + .mode = 0555, + .child = dccp_table, + }, + { .ctl_name = 0, }, +}; + +static struct ctl_table dccp_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = dccp_dir_table, + }, + { .ctl_name = 0, }, +}; + +static struct ctl_table_header *dccp_table_header; + +int __init dccp_sysctl_init(void) +{ + dccp_table_header = register_sysctl_table(dccp_root_table, 1); + + return dccp_table_header != NULL ? 0 : -ENOMEM; +} + +void dccp_sysctl_exit(void) +{ + if (dccp_table_header != NULL) { + unregister_sysctl_table(dccp_table_header); + dccp_table_header = NULL; + } +} -- cgit v1.2.3-70-g09d2 From abd596a4b68b6526c2676233e10602dd9660e9d7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 20 Mar 2006 22:39:47 -0800 Subject: [IPV4] ARP: Alloc acceptance of unsolicited ARP via netdevice sysctl. Signed-off-by: Neil Horman Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 1 + include/linux/sysctl.h | 1 + net/ipv4/arp.c | 20 ++++++++++---------- net/ipv4/devinet.c | 8 ++++++++ 4 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux/sysctl.h') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index fd7af86151b..92297ff24e8 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -25,6 +25,7 @@ struct ipv4_devconf int arp_filter; int arp_announce; int arp_ignore; + int arp_accept; int medium_id; int no_xfrm; int no_policy; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index dfcf449afc7..8754568a75d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -456,6 +456,7 @@ enum NET_IPV4_CONF_ARP_ANNOUNCE=18, NET_IPV4_CONF_ARP_IGNORE=19, NET_IPV4_CONF_PROMOTE_SECONDARIES=20, + NET_IPV4_CONF_ARP_ACCEPT=21, __NET_IPV4_CONF_MAX }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index accdefedfed..041dadde31a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -879,16 +879,16 @@ static int arp_process(struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); -#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP - /* Unsolicited ARP is not accepted by default. - It is possible, that this option should be enabled for some - devices (strip is candidate) - */ - if (n == NULL && - arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(sip) == RTN_UNICAST) - n = __neigh_lookup(&arp_tbl, &sip, dev, -1); -#endif + if (ipv4_devconf.arp_accept) { + /* Unsolicited ARP is not accepted by default. 
+ It is possible, that this option should be enabled for some + devices (strip is candidate) + */ + if (n == NULL && + arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(sip) == RTN_UNICAST) + n = __neigh_lookup(&arp_tbl, &sip, dev, -1); + } if (n) { int state = NUD_REACHABLE; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3ffa60dadc0..44fdf1413e2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1393,6 +1393,14 @@ static struct devinet_sysctl_table { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_IPV4_CONF_ARP_ACCEPT, + .procname = "arp_accept", + .data = &ipv4_devconf.arp_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = NET_IPV4_CONF_NOXFRM, .procname = "disable_xfrm", -- cgit v1.2.3-70-g09d2 From 15d99e02babae8bc20b836917ace07d93e318149 Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Mon, 20 Mar 2006 22:40:29 -0800 Subject: [TCP]: sysctl to allow TCP window > 32767 sans wscale Back in the dark ages, we had to be conservative and only allow 15-bit window fields if the window scale option was not negotiated. Some ancient stacks used a signed 16-bit quantity for the window field of the TCP header and would get confused. Those days are long gone, so we can use the full 16-bits by default now. There is a sysctl added so that we can still interact with such old stacks Signed-off-by: Rick Jones Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/sysctl.h | 1 + include/net/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 ++++++++- net/ipv4/tcp_output.c | 23 +++++++++++++++++------ 5 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include/linux/sysctl.h') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 651298ba019..f12007b80a4 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -355,6 +355,13 @@ somaxconn - INTEGER Defaults to 128. See also tcp_max_syn_backlog for additional tuning for TCP sockets. +tcp_workaround_signed_windows - BOOLEAN + If set, assume no receipt of a window scaling option means the + remote TCP is broken and treats the window as a signed quantity. + If unset, assume the remote TCP is not broken even if we do + not receive a window scaling option from them. + Default: 0 + IP Variables: ip_local_port_range - 2 INTEGERS diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 8754568a75d..76eaeff76f8 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -402,6 +402,7 @@ enum NET_IPV4_IPFRAG_MAX_DIST=112, NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, + NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index 16879fa560d..457e224de46 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -224,6 +224,7 @@ extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_abc; extern int sysctl_tcp_mtu_probing; extern int sysctl_tcp_base_mss; +extern int sysctl_tcp_workaround_signed_windows; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ebf2e0b363c..6b6c3adfcf0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -680,7 +680,14 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - + { + 
.ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, + .procname = "tcp_workaround_signed_windows", + .data = &sysctl_tcp_workaround_signed_windows, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 518e568b53f..9d79546d384 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -45,6 +45,11 @@ /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; +/* People can turn this on to work with those rare, broken TCPs that + * interpret the window field as a signed quantity. + */ +int sysctl_tcp_workaround_signed_windows = 0; + /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. @@ -177,12 +182,18 @@ void tcp_select_initial_window(int __space, __u32 mss, space = (space / mss) * mss; /* NOTE: offering an initial window larger than 32767 - * will break some buggy TCP stacks. We try to be nice. - * If we are not window scaling, then this truncates - * our initial window offering to 32k. There should also - * be a sysctl option to stop being nice. + * will break some buggy TCP stacks. If the admin tells us + * it is likely we could be speaking with such a buggy stack + * we will truncate our initial window offering to 32K-1 + * unless the remote has sent us a window scaling option, + * which we interpret as a sign the remote TCP is not + * misinterpreting the window field as a signed quantity. */ - (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + if (sysctl_tcp_workaround_signed_windows) + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + else + (*rcv_wnd) = space; + (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window @@ -241,7 +252,7 @@ static u16 tcp_select_window(struct sock *sk) /* Make sure we do not exceed the maximum possible * scaled window. 
*/ - if (!tp->rx_opt.rcv_wscale) + if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); -- cgit v1.2.3-70-g09d2