From 7351a22a3ae005422488139365e9a80f560c80b9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Nov 2007 20:33:46 -0800 Subject: [NETFILTER]: ip{,6}_queue: convert to seq_file interface I plan to kill ->get_info which means killing proc_net_create(). Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 10a2ce09fd8..14d64a383db 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -607,15 +608,11 @@ static ctl_table ipq_root_table[] = { { .ctl_name = 0 } }; -#ifdef CONFIG_PROC_FS -static int -ipq_get_info(char *buffer, char **start, off_t offset, int length) +static int ip_queue_show(struct seq_file *m, void *v) { - int len; - read_lock_bh(&queue_lock); - len = sprintf(buffer, + seq_printf(m, "Peer PID : %d\n" "Copy mode : %hu\n" "Copy range : %u\n" @@ -632,16 +629,21 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) queue_user_dropped); read_unlock_bh(&queue_lock); + return 0; +} - *start = buffer + offset; - len -= offset; - if (len > length) - len = length; - else if (len < 0) - len = 0; - return len; +static int ip_queue_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_queue_show, NULL); } -#endif /* CONFIG_PROC_FS */ + +static const struct file_operations ip_queue_proc_fops = { + .open = ip_queue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; static struct nf_queue_handler nfqh = { .name = "ip_queue", @@ -661,10 +663,11 @@ static int __init ip_queue_init(void) goto cleanup_netlink_notifier; } - proc = proc_net_create(&init_net, IPQ_PROC_FS_NAME, 0, ipq_get_info); - if (proc) + proc = create_proc_entry(IPQ_PROC_FS_NAME, 0, init_net.proc_net); + if (proc) { proc->owner = THIS_MODULE; - else { + proc->proc_fops = &ip_queue_proc_fops; + } else { printk(KERN_ERR "ip_queue: failed to create proc entry\n"); goto cleanup_ipqnl; } -- cgit v1.2.3-70-g09d2 From 0795c65d9f8de2bf9a62ae1f56e928c6b5ed75ab Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 5 Nov 2007 20:42:54 -0800 Subject: [NETFILTER]: Clean up Makefile Sort matches and targets in the NF makefiles. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Makefile | 20 ++++++++++---------- net/ipv6/netfilter/Makefile | 28 ++++++++++++++++------------ net/netfilter/Makefile | 14 +++++++------- 3 files changed, 33 insertions(+), 29 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 409d273f6f8..7456833d6ad 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,27 +41,27 @@ obj-$(CONFIG_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches +obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o +obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o -obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o -obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o +obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o # targets -obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o -obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o +obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o -obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o -obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o -obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o -obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 4513eab7739..e789ec44d23 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -4,25 +4,29 @@ # Link order matters here. obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o -obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o -obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o -obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o -obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o -obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o -obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o -obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o -obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o -obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o -obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o -obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o -obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o # objects for l3 independent conntrack nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o # l3 independent conntrack obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o + +# matches +obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o +obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o +obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o +obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o +obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o +obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o +obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o +obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o + +# targets +obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o +obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o +obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 93c58f97383..ad0e36ebea3 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -40,15 +40,15 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o # targets obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o -obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o -obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o -obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o @@ -59,22 +59,22 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o -obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o -obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o -obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o -obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o -- cgit v1.2.3-70-g09d2 From d1332e0ab84479d941de5cf4a69c71dfd385a25e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Nov 2007 20:43:30 -0800 Subject: [NETFILTER]: remove unneeded rcu_dereference() calls As noticed by Paul McKenney, the rcu_dereference calls in the init path of NAT modules are unneeded, remove them. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_amanda.c | 2 +- net/ipv4/netfilter/nf_nat_ftp.c | 2 +- net/ipv4/netfilter/nf_nat_h323.c | 18 +++++++++--------- net/ipv4/netfilter/nf_nat_irc.c | 2 +- net/ipv4/netfilter/nf_nat_pptp.c | 8 ++++---- net/ipv4/netfilter/nf_nat_sip.c | 4 ++-- net/ipv4/netfilter/nf_nat_tftp.c | 2 +- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 35a5aa69cd9..c31b8766825 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -69,7 +69,7 @@ static void __exit nf_nat_amanda_fini(void) static int __init nf_nat_amanda_init(void) { - BUG_ON(rcu_dereference(nf_nat_amanda_hook)); + BUG_ON(nf_nat_amanda_hook != NULL); rcu_assign_pointer(nf_nat_amanda_hook, help); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index e1a16d3ea4c..a1d5d58a58b 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -147,7 +147,7 @@ static void __exit nf_nat_ftp_fini(void) static int __init nf_nat_ftp_init(void) { - BUG_ON(rcu_dereference(nf_nat_ftp_hook)); + BUG_ON(nf_nat_ftp_hook != NULL); rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index a868c8c4132..93e18ef114f 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -544,15 +544,15 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, /****************************************************************************/ static int __init init(void) { - BUG_ON(rcu_dereference(set_h245_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_h225_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_sig_addr_hook) != NULL); - BUG_ON(rcu_dereference(set_ras_addr_hook) != NULL); - BUG_ON(rcu_dereference(nat_rtp_rtcp_hook) != NULL); - BUG_ON(rcu_dereference(nat_t120_hook) != NULL); - BUG_ON(rcu_dereference(nat_h245_hook) != NULL); - BUG_ON(rcu_dereference(nat_callforwarding_hook) != NULL); - BUG_ON(rcu_dereference(nat_q931_hook) != NULL); + BUG_ON(set_h245_addr_hook != NULL); + BUG_ON(set_h225_addr_hook != NULL); + BUG_ON(set_sig_addr_hook != NULL); + BUG_ON(set_ras_addr_hook != NULL); + BUG_ON(nat_rtp_rtcp_hook != NULL); + BUG_ON(nat_t120_hook != NULL); + BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); + BUG_ON(nat_q931_hook != NULL); rcu_assign_pointer(set_h245_addr_hook, set_h245_addr); rcu_assign_pointer(set_h225_addr_hook, set_h225_addr); diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c index 766e2c16c6b..fe6f9cef6c8 100644 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ b/net/ipv4/netfilter/nf_nat_irc.c @@ -74,7 +74,7 @@ static void __exit nf_nat_irc_fini(void) static int __init nf_nat_irc_init(void) { - BUG_ON(rcu_dereference(nf_nat_irc_hook)); + BUG_ON(nf_nat_irc_hook != NULL); rcu_assign_pointer(nf_nat_irc_hook, help); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index e1385a09907..6817e7995f3 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -281,16 +281,16 @@ static int __init nf_nat_helper_pptp_init(void) { nf_nat_need_gre(); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_outbound)); + BUG_ON(nf_nat_pptp_hook_outbound != NULL); rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_inbound)); + BUG_ON(nf_nat_pptp_hook_inbound != NULL); rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_exp_gre)); + BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); - BUG_ON(rcu_dereference(nf_nat_pptp_hook_expectfn)); + BUG_ON(nf_nat_pptp_hook_expectfn != NULL); rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected); return 0; } diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index ce9edbcc01e..3ca98971a1e 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -293,8 +293,8 @@ static void __exit nf_nat_sip_fini(void) static int __init nf_nat_sip_init(void) { - BUG_ON(rcu_dereference(nf_nat_sip_hook)); - BUG_ON(rcu_dereference(nf_nat_sdp_hook)); + BUG_ON(nf_nat_sip_hook != NULL); + BUG_ON(nf_nat_sdp_hook != NULL); rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); rcu_assign_pointer(nf_nat_sdp_hook, ip_nat_sdp); return 0; diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c index 0ecec701cb4..1360a94766d 100644 --- a/net/ipv4/netfilter/nf_nat_tftp.c +++ b/net/ipv4/netfilter/nf_nat_tftp.c @@ -43,7 +43,7 @@ static void __exit nf_nat_tftp_fini(void) static int __init nf_nat_tftp_init(void) { - BUG_ON(rcu_dereference(nf_nat_tftp_hook)); + BUG_ON(nf_nat_tftp_hook != NULL); rcu_assign_pointer(nf_nat_tftp_hook, help); return 0; } -- cgit v1.2.3-70-g09d2 From 429f08e950a88cd826b203ea898c2f2d0f7db9de Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 5 Nov 2007 21:03:24 -0800 Subject: [IPV4]: Consolidate the ip cork destruction in ip_output.c The ip_push_pending_frames and ip_flush_pending_frames do the same things to flush the sock's cork. Move this into a separate function and save ~80 bytes from the .text Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e5f7dc2de30..fd99fbd685e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1183,6 +1183,17 @@ error: return err; } +static void ip_cork_release(struct inet_sock *inet) +{ + inet->cork.flags &= ~IPCORK_OPT; + kfree(inet->cork.opt); + inet->cork.opt = NULL; + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } +} + /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. @@ -1276,13 +1287,7 @@ int ip_push_pending_frames(struct sock *sk) } out: - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet); return err; error: @@ -1295,19 +1300,12 @@ error: */ void ip_flush_pending_frames(struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - if (inet->cork.rt) { - ip_rt_put(inet->cork.rt); - inet->cork.rt = NULL; - } + ip_cork_release(inet_sk(sk)); } -- cgit v1.2.3-70-g09d2 From 6a9fb9479f2672fa392711735de9e642395c9a14 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 5 Nov 2007 21:32:31 -0800 Subject: [IPV4]: Clean the ip_sockglue.c from some ugly ifdefs The #idfed CONFIG_IP_MROUTE is sometimes places inside the if-s, which looks completely bad. Similar ifdefs inside the functions looks a bit better, but they are also not recommended to be used. Provide an ifdef-ed ip_mroute_opt() helper to cleanup the code. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/linux/mroute.h | 12 ++++++++++++ net/ipv4/ip_sockglue.c | 39 ++++++++++++--------------------------- 2 files changed, 24 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7da2cee8e13..35a8277ec1b 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -128,6 +128,18 @@ struct igmpmsg #ifdef __KERNEL__ #include +#ifdef CONFIG_IP_MROUTE +static inline int ip_mroute_opt(int opt) +{ + return (opt >= MRT_BASE) && (opt <= MRT_BASE + 10); +} +#else +static inline int ip_mroute_opt(int opt) +{ + return 0; +} +#endif + extern int ip_mroute_setsockopt(struct sock *, int, char __user *, int); extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f51f20e487c..82817e55436 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -437,10 +437,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ -#ifdef CONFIG_IP_MROUTE - if (optname >= MRT_BASE && optname <= (MRT_BASE + 10)) + if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk,optname,optval,optlen); -#endif err = 0; lock_sock(sk); @@ -909,11 +907,9 @@ int ip_setsockopt(struct sock *sk, int level, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > (MRT_BASE + 10)) -#endif - ) { + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { lock_sock(sk); err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); release_sock(sk); @@ -935,11 +931,9 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && - optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > (MRT_BASE + 10)) -#endif - ) { + optname != IP_IPSEC_POLICY && + optname != IP_XFRM_POLICY && + !ip_mroute_opt(optname)) { lock_sock(sk); err = compat_nf_setsockopt(sk, PF_INET, optname, optval, optlen); @@ -967,11 +961,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IP) return -EOPNOTSUPP; -#ifdef CONFIG_IP_MROUTE - if (optname >= MRT_BASE && optname <= MRT_BASE+10) { + if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk,optname,optval,optlen); - } -#endif if (get_user(len,optlen)) return -EFAULT; @@ -1171,11 +1162,8 @@ int ip_getsockopt(struct sock *sk, int level, err = do_ip_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > MRT_BASE+10) -#endif - ) { + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { int len; if (get_user(len,optlen)) @@ -1200,11 +1188,8 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, int err = do_ip_getsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ - if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS -#ifdef CONFIG_IP_MROUTE - && (optname < MRT_BASE || optname > MRT_BASE+10) -#endif - ) { + if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && + !ip_mroute_opt(optname)) { int len; if (get_user(len, optlen)) -- cgit v1.2.3-70-g09d2 From 286ab3d46058840d68e5d7d52e316c1f7e98c59f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Nov 2007 23:38:39 -0800 Subject: [NET]: Define infrastructure to keep 'inuse' changes in an efficent SMP/NUMA way. "struct proto" currently uses an array stats[NR_CPUS] to track change on 'inuse' sockets per protocol. If NR_CPUS is big, this means we use a big memory area for this. Moreover, all this memory area is located on a single node on NUMA machines, increasing memory pressure on the boot node. In this patch, I tried to : - Keep a fast !CONFIG_SMP implementation - Keep a fast CONFIG_SMP implementation for often used protocols (tcp,udp,raw,...) - Introduce a NUMA efficient implementation Some helper macros are defined in include/net/sock.h These macros take into account CONFIG_SMP If a "struct proto" is declared without using DEFINE_PROTO_INUSE / REF_PROTO_INUSE macros, it will automatically use a default implementation, using a dynamically allocated percpu zone. This default implementation will be NUMA efficient, but might use 32/64 bytes per possible cpu because of current alloc_percpu() implementation. However it still should be better than previous implementation based on stats[NR_CPUS] field. When a "struct proto" is changed to use the new macros, we use a single static "int" percpu variable, lowering the memory and cpu costs, still preserving NUMA efficiency. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++------ net/core/sock.c | 48 ++++++++++++++++++++++++++++++++++++++++- net/ipv4/proc.c | 19 ++++------------ net/ipv6/proc.c | 19 ++++------------ 4 files changed, 112 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/sock.h b/include/net/sock.h index 20de3fa7ae4..5504fb9fa88 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -560,6 +560,14 @@ struct proto { void (*unhash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_SMP + /* Keeping track of sockets in use */ + void (*inuse_add)(struct proto *prot, int inc); + int (*inuse_getval)(const struct proto *prot); + int *inuse_ptr; +#else + int inuse; +#endif /* Memory pressure */ void (*enter_memory_pressure)(void); atomic_t *memory_allocated; /* Current allocated memory. */ @@ -592,12 +600,38 @@ struct proto { #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif - struct { - int inuse; - u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; - } stats[NR_CPUS]; }; +/* + * Special macros to let protos use a fast version of inuse{get|add} + * using a static percpu variable per proto instead of an allocated one, + * saving one dereference. + * This might be changed if/when dynamic percpu vars become fast. + */ +#ifdef CONFIG_SMP +# define DEFINE_PROTO_INUSE(NAME) \ +static DEFINE_PER_CPU(int, NAME##_inuse); \ +static void NAME##_inuse_add(struct proto *prot, int inc) \ +{ \ + __get_cpu_var(NAME##_inuse) += inc; \ +} \ + \ +static int NAME##_inuse_getval(const struct proto *prot)\ +{ \ + int res = 0, cpu; \ + \ + for_each_possible_cpu(cpu) \ + res += per_cpu(NAME##_inuse, cpu); \ + return res; \ +} +# define REF_PROTO_INUSE(NAME) \ + .inuse_add = NAME##_inuse_add, \ + .inuse_getval = NAME##_inuse_getval, +#else +# define DEFINE_PROTO_INUSE(NAME) +# define REF_PROTO_INUSE(NAME) +#endif + extern int proto_register(struct proto *prot, int alloc_slab); extern void proto_unregister(struct proto *prot); @@ -629,12 +663,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse++; +#ifdef CONFIG_SMP + prot->inuse_add(prot, 1); +#else + prot->inuse++; +#endif } static __inline__ void sock_prot_dec_use(struct proto *prot) { - prot->stats[smp_processor_id()].inuse--; +#ifdef CONFIG_SMP + prot->inuse_add(prot, -1); +#else + prot->inuse--; +#endif +} + +static __inline__ int sock_prot_inuse(struct proto *proto) +{ +#ifdef CONFIG_SMP + return proto->inuse_getval(proto); +#else + return proto->inuse; +#endif } /* With per-bucket locks this operation is not-atomic, so that diff --git a/net/core/sock.c b/net/core/sock.c index 12ad2067a98..e077f263b73 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1801,12 +1801,41 @@ EXPORT_SYMBOL(sk_common_release); static DEFINE_RWLOCK(proto_list_lock); static LIST_HEAD(proto_list); +#ifdef CONFIG_SMP +/* + * Define default functions to keep track of inuse sockets per protocol + * Note that often used protocols use dedicated functions to get a speed increase. + * (see DEFINE_PROTO_INUSE/REF_PROTO_INUSE) + */ +static void inuse_add(struct proto *prot, int inc) +{ + per_cpu_ptr(prot->inuse_ptr, smp_processor_id())[0] += inc; +} + +static int inuse_get(const struct proto *prot) +{ + int res = 0, cpu; + for_each_possible_cpu(cpu) + res += per_cpu_ptr(prot->inuse_ptr, cpu)[0]; + return res; +} +#endif + int proto_register(struct proto *prot, int alloc_slab) { char *request_sock_slab_name = NULL; char *timewait_sock_slab_name; int rc = -ENOBUFS; +#ifdef CONFIG_SMP + if (!prot->inuse_getval || !prot->inuse_add) { + prot->inuse_ptr = alloc_percpu(int); + if (prot->inuse_ptr == NULL) + goto out; + prot->inuse_getval = inuse_get; + prot->inuse_add = inuse_add; + } +#endif if (alloc_slab) { prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -1814,7 +1843,7 @@ int proto_register(struct proto *prot, int alloc_slab) if (prot->slab == NULL) { printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", prot->name); - goto out; + goto out_free_inuse; } if (prot->rsk_prot != NULL) { @@ -1873,6 +1902,15 @@ out_free_request_sock_slab_name: out_free_sock_slab: kmem_cache_destroy(prot->slab); prot->slab = NULL; +out_free_inuse: +#ifdef CONFIG_SMP + if (prot->inuse_ptr != NULL) { + free_percpu(prot->inuse_ptr); + prot->inuse_ptr = NULL; + prot->inuse_getval = NULL; + prot->inuse_add = NULL; + } +#endif goto out; } @@ -1884,6 +1922,14 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); write_unlock(&proto_list_lock); +#ifdef CONFIG_SMP + if (prot->inuse_ptr != NULL) { + free_percpu(prot->inuse_ptr); + prot->inuse_ptr = NULL; + prot->inuse_getval = NULL; + prot->inuse_add = NULL; + } +#endif if (prot->slab != NULL) { kmem_cache_destroy(prot->slab); prot->slab = NULL; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ffdccc0972e..ce34b281803 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -46,17 +46,6 @@ #include #include -static int fold_prot_inuse(struct proto *proto) -{ - int res = 0; - int cpu; - - for_each_possible_cpu(cpu) - res += proto->stats[cpu].inuse; - - return res; -} - /* * Report socket allocation statistics [mea@utu.fi] */ @@ -64,12 +53,12 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", - fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); - seq_printf(seq, "UDPLITE: inuse %d\n", fold_prot_inuse(&udplite_prot)); - seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); + seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot)); + seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot)); seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues(), ip_frag_mem()); return 0; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index be526ad9254..8631ed7fe8a 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -32,27 +32,16 @@ static struct proc_dir_entry *proc_net_devsnmp6; -static int fold_prot_inuse(struct proto *proto) -{ - int res = 0; - int cpu; - - for_each_possible_cpu(cpu) - res += proto->stats[cpu].inuse; - - return res; -} - static int sockstat6_seq_show(struct seq_file *seq, void *v) { seq_printf(seq, "TCP6: inuse %d\n", - fold_prot_inuse(&tcpv6_prot)); + sock_prot_inuse(&tcpv6_prot)); seq_printf(seq, "UDP6: inuse %d\n", - fold_prot_inuse(&udpv6_prot)); + sock_prot_inuse(&udpv6_prot)); seq_printf(seq, "UDPLITE6: inuse %d\n", - fold_prot_inuse(&udplitev6_prot)); + sock_prot_inuse(&udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", - fold_prot_inuse(&rawv6_prot)); + sock_prot_inuse(&rawv6_prot)); seq_printf(seq, "FRAG6: inuse %d memory %d\n", ip6_frag_nqueues(), ip6_frag_mem()); return 0; -- cgit v1.2.3-70-g09d2 From 47a31a6ffcca3b55149bccd5b99763e5eea60ac4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Nov 2007 23:39:16 -0800 Subject: [IPV4]: Use the {DEFINE|REF}_PROTO_INUSE infrastructure Trivial patch to make "tcp,udp,udplite,raw" protocols uses the fast "inuse sockets" infrastructure Each protocol use then a static percpu var, instead of a dynamic one. This saves some ram and some cpu cycles Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/raw.c | 3 +++ net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/udp.c | 3 +++ net/ipv4/udplite.c | 3 +++ 4 files changed, 12 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 3916faca3af..66b42f547bf 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -760,6 +760,8 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) } } +DEFINE_PROTO_INUSE(raw) + struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, @@ -781,6 +783,7 @@ struct proto raw_prot = { .compat_setsockopt = compat_raw_setsockopt, .compat_getsockopt = compat_raw_getsockopt, #endif + REF_PROTO_INUSE(raw) }; #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d438dfb0c8f..e9127cdced2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2417,6 +2417,8 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +DEFINE_PROTO_INUSE(tcp) + struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -2451,6 +2453,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif + REF_PROTO_INUSE(tcp) }; void __init tcp_v4_init(struct net_proto_family *ops) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4bc25b46f33..03c400ca14c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1430,6 +1430,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) } +DEFINE_PROTO_INUSE(udp) + struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, @@ -1452,6 +1454,7 @@ struct proto udp_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif + REF_PROTO_INUSE(udp) }; /* ------------------------------------------------------------------------ */ diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 94977205abb..f5baeb3e8b8 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -44,6 +44,8 @@ static struct net_protocol udplite_protocol = { .no_policy = 1, }; +DEFINE_PROTO_INUSE(udplite) + struct proto udplite_prot = { .name = "UDP-Lite", .owner = THIS_MODULE, @@ -67,6 +69,7 @@ struct proto udplite_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif + REF_PROTO_INUSE(udplite) }; static struct inet_protosw udplite4_protosw = { -- cgit v1.2.3-70-g09d2 From c3e9a353d8fc64a82ab11a07e21902e25e1e96d1 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 6 Nov 2007 23:34:04 -0800 Subject: [IPV4]: Compact some ifdefs in the fib code. There are places that check for CONFIG_IP_MULTIPLE_TABLES twice in the same file, but the internals of these #ifdefs can be merged. As a side effect - remove one ifdef from inside a function. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 15 ++++++--------- net/ipv4/fib_frontend.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 8cadc77c7df..ed514bfb61b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -185,6 +185,12 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result } #else /* CONFIG_IP_MULTIPLE_TABLES */ +extern void __init fib4_rules_init(void); + +#ifdef CONFIG_NET_CLS_ROUTE +extern u32 fib_rules_tclass(struct fib_result *res); +#endif + #define ip_fib_local_table fib_get_table(RT_TABLE_LOCAL) #define ip_fib_main_table fib_get_table(RT_TABLE_MAIN) @@ -214,15 +220,6 @@ extern __be32 __fib_res_prefsrc(struct fib_result *res); /* Exported by fib_hash.c */ extern struct fib_table *fib_hash_init(u32 id); -#ifdef CONFIG_IP_MULTIPLE_TABLES -extern void __init fib4_rules_init(void); - -#ifdef CONFIG_NET_CLS_ROUTE -extern u32 fib_rules_tclass(struct fib_result *res); -#endif - -#endif - static inline void fib_combine_itag(u32 *itag, struct fib_result *res) { #ifdef CONFIG_NET_CLS_ROUTE diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 60123905dbb..732d8f088b1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -59,6 +59,13 @@ struct fib_table *ip_fib_main_table; #define FIB_TABLE_HASHSZ 1 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +static void __init fib4_rules_init(void) +{ + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); + hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); +} #else #define FIB_TABLE_HASHSZ 256 @@ -905,14 +912,8 @@ void __init ip_fib_init(void) for (i = 0; i < FIB_TABLE_HASHSZ; i++) INIT_HLIST_HEAD(&fib_table_hash[i]); -#ifndef CONFIG_IP_MULTIPLE_TABLES - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); - hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); -#else + fib4_rules_init(); -#endif register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); -- cgit v1.2.3-70-g09d2 From 4999f3621f4da622e77931b3d33ada6c7083c705 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 7 Nov 2007 02:21:47 -0800 Subject: [IPSEC]: Fix crypto_alloc_comp error checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function crypto_alloc_comp returns an errno instead of NULL to indicate error. So it needs to be tested with IS_ERR. This is based on a patch by Vicenç Beltran Querol. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ipcomp.c | 3 ++- net/ipv6/ipcomp6.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index ca1b5fdb8d3..2c44a94c213 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -344,7 +345,7 @@ static struct crypto_comp **ipcomp_alloc_tfms(const char *alg_name) for_each_possible_cpu(cpu) { struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, CRYPTO_ALG_ASYNC); - if (!tfm) + if (IS_ERR(tfm)) goto error; *per_cpu_ptr(tfms, cpu) = tfm; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 85eb4798d8d..0cd4056f912 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -358,7 +359,7 @@ static struct crypto_comp **ipcomp6_alloc_tfms(const char *alg_name) for_each_possible_cpu(cpu) { struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, CRYPTO_ALG_ASYNC); - if (!tfm) + if (IS_ERR(tfm)) goto error; *per_cpu_ptr(tfms, cpu) = tfm; } -- cgit v1.2.3-70-g09d2 From 1e356f9cdfa885c78791d5d6e5d2baef22f01853 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 7 Nov 2007 02:35:54 -0800 Subject: [IPVS]: Bind connections on stanby if the destination exists This patch fixes the problem with node overload on director fail-over. Given the scenario: 2 nodes each accepting 3 connections at a time and 2 directors, director failover occurs when the nodes are fully loaded (6 connections to the cluster) in this case the new director will assign another 6 connections to the cluster, If the same real servers exist there. The problem turned to be in not binding the inherited connections to the real servers (destinations) on the backup director. Therefore: "ipvsadm -l" reports 0 connections: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 0 -> node484.local:5999 Route 1000 0 0 while "ipvs -lnc" is right root@test2:~# ipvsadm -lnc IPVS connection entries pro expire state source virtual destination TCP 14:56 ESTABLISHED 192.168.0.10:39164 192.168.0.222:5999 192.168.0.51:5999 TCP 14:59 ESTABLISHED 192.168.0.10:39165 192.168.0.222:5999 192.168.0.52:5999 So the patch I am sending fixes the problem by binding the received connections to the appropriate service on the backup director, if it exists, else the connection will be handled the old way. So if the master and the backup directors are synchronized in terms of real services there will be no problem with server over-committing since new connections will not be created on the nonexistent real services on the backup. However if the service is created later on the backup, the binding will be performed when the next connection update is received. With this patch the inherited connections will show as inactive on the backup: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 1 -> node484.local:5999 Route 1000 0 1 rumen@test2:~$ cat /proc/net/ip_vs IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP C0A800DE:176F wlc -> C0A80033:176F Route 1000 0 1 -> C0A80032:176F Route 1000 0 1 Regards, Rumen Bogdanovski Acked-by: Julian Anastasov Signed-off-by: Rumen G. Bogdanovski Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++++ net/ipv4/ipvs/ip_vs_conn.c | 19 +++++++++++++++++++ net/ipv4/ipvs/ip_vs_ctl.c | 26 ++++++++++++++++++++++++++ net/ipv4/ipvs/ip_vs_sync.c | 24 ++++++++++++++++++++---- 4 files changed, 69 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 41870564df8..1fd1ee896f3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -901,6 +901,10 @@ extern int ip_vs_use_count_inc(void); extern void ip_vs_use_count_dec(void); extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); +extern struct ip_vs_dest * +ip_vs_find_dest(__be32 daddr, __be16 dport, + __be32 vaddr, __be16 vport, __u16 protocol); +extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); /* diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 4b702f708d3..b7eeae622d9 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -425,6 +425,25 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) } +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(cp->daddr, cp->dport, + cp->vaddr, cp->vport, cp->protocol); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} +EXPORT_SYMBOL(ip_vs_try_bind_dest); + + /* * Unbind a connection entry with its VS destination * Called by the ip_vs_conn_expire function. diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7345fc252a2..3c4d22a468e 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -579,6 +579,32 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) return NULL; } +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Cretaed to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promissing, but + * seems not working as expected. + */ +struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, + __be32 vaddr, __be16 vport, __u16 protocol) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(0, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} +EXPORT_SYMBOL(ip_vs_find_dest); /* * Lookup dest by {svc,addr,port} in the destination trash. diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 0d4d9721cbd..b1694d67abb 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -284,6 +284,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) struct ip_vs_sync_conn_options *opt; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_dest *dest; char *p; int i; @@ -317,20 +318,35 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) s->caddr, s->cport, s->vaddr, s->vport); if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(s->daddr, s->dport, + s->vaddr, s->vport, + s->protocol); cp = ip_vs_conn_new(s->protocol, s->caddr, s->cport, s->vaddr, s->vport, s->daddr, s->dport, - flags, NULL); + flags, dest); + if (dest) + atomic_dec(&dest->refcnt); if (!cp) { IP_VS_ERR("ip_vs_conn_new failed\n"); return; } cp->state = ntohs(s->state); } else if (!cp->dest) { - /* it is an entry created by the synchronization */ - cp->state = ntohs(s->state); - cp->flags = flags | IP_VS_CONN_F_HASHED; + dest = ip_vs_try_bind_dest(cp); + if (!dest) { + /* it is an unbound entry created by + * synchronization */ + cp->state = ntohs(s->state); + cp->flags = flags | IP_VS_CONN_F_HASHED; + } else + atomic_dec(&dest->refcnt); } /* Note that we don't touch its state and flags if it is a normal entry. */ -- cgit v1.2.3-70-g09d2 From efac52762b1e3fe3035d29e82d8ee1aebc45e4a7 Mon Sep 17 00:00:00 2001 From: "Rumen G. Bogdanovski" Date: Wed, 7 Nov 2007 02:36:55 -0800 Subject: [IPVS]: Synchronize closing of Connections This patch makes the master daemon to sync the connection when it is about to close. This makes the connections on the backup to close or timeout according their state. Before the sync was performed only if the connection is in ESTABLISHED state which always made the connections to timeout in the hard coded 3 minutes. However the Andy Gospodarek's patch ([IPVS]: use proper timeout instead of fixed value) effectively did nothing more than increasing this to 15 minutes (Established state timeout). So this patch makes use of proper timeout since it syncs the connections on status changes to FIN_WAIT (2min timeout) and CLOSE (10sec timeout). However if the backup misses CLOSE hopefully it did not miss FIN_WAIT. Otherwise we will just have to wait for the ESTABLISHED state timeout. As it is without this patch. This way the number of the hanging connections on the backup is kept to minimum. And very few of them will be left to timeout with a long timeout. This is important if we want to make use of the fix for the real server overcommit on master/backup fail-over. Signed-off-by: Rumen G. Bogdanovski Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/net/ip_vs.h | 4 ++++ net/ipv4/ipvs/ip_vs_core.c | 20 ++++++++++++++------ net/ipv4/ipvs/ip_vs_sync.c | 2 +- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 1fd1ee896f3..67ea2c0c0ab 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -520,6 +520,10 @@ struct ip_vs_conn { spinlock_t lock; /* lock for state transition */ volatile __u16 flags; /* status flags */ volatile __u16 state; /* state info */ + volatile __u16 old_state; /* old state, to be used for + * state transition triggerd + * synchronization + */ /* Control members */ struct ip_vs_conn *control; /* Master control connection */ diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index c6ed7654e83..20c884a5772 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -979,15 +979,23 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, ret = NF_ACCEPT; } - /* increase its packet counter and check if it is needed - to be synchronized */ + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + */ atomic_inc(&cp->in_pkts); if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && - (cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE))))) ip_vs_sync_conn(cp); + cp->old_state = cp->state; ip_vs_conn_put(cp); return ret; diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index b1694d67abb..bd930efc18d 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -343,7 +343,6 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) if (!dest) { /* it is an unbound entry created by * synchronization */ - cp->state = ntohs(s->state); cp->flags = flags | IP_VS_CONN_F_HASHED; } else atomic_dec(&dest->refcnt); @@ -358,6 +357,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p += SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = ntohs(s->state); pp = ip_vs_proto_get(s->protocol); cp->timeout = pp->timeout_table[cp->state]; ip_vs_conn_put(cp); -- cgit v1.2.3-70-g09d2 From 230140cffa7feae90ad50bf259db1fa07674f3a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Nov 2007 02:40:20 -0800 Subject: [INET]: Remove per bucket rwlock in tcp/dccp ehash table. As done two years ago on IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle performance differences. (we hit a different cache line for the rwlock, but then the bucket cache line have a better sharing factor among cpus, since we dirty it less often). For netstat or ss commands that want a full scan of hash table, we perform fewer memory accesses. Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. Signed-off-by: Eric Dumazet Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 71 +++++++++++++++++++++++++++++++++++++++---- net/dccp/proto.c | 9 ++++-- net/ipv4/inet_diag.c | 9 +++--- net/ipv4/inet_hashtables.c | 7 +++-- net/ipv4/inet_timewait_sock.c | 13 ++++---- net/ipv4/tcp.c | 4 +-- net/ipv4/tcp_ipv4.c | 11 ++++--- net/ipv6/inet6_hashtables.c | 19 ++++++------ 8 files changed, 106 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4427dcd1e53..8461cda3749 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -37,7 +37,6 @@ * I'll experiment with dynamic table growth later. */ struct inet_ehash_bucket { - rwlock_t lock; struct hlist_head chain; struct hlist_head twchain; }; @@ -100,6 +99,9 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; + rwlock_t *ehash_locks; + unsigned int ehash_size; + unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. @@ -107,7 +109,7 @@ struct inet_hashinfo { struct inet_bind_hashbucket *bhash; unsigned int bhash_size; - unsigned int ehash_size; + /* Note : 4 bytes padding on 64 bit arches */ /* All sockets in TCP_LISTEN state will be in here. This is the only * table where wildcard'd TCP sockets can exist. Hash function here @@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket( return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } +static inline rwlock_t *inet_ehash_lockp( + struct inet_hashinfo *hashinfo, + unsigned int hash) +{ + return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; +} + +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, size = 256; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + if (nr_pcpus >= 4) + size = 512; + if (nr_pcpus >= 8) + size = 1024; + if (nr_pcpus >= 16) + size = 2048; + if (nr_pcpus >= 32) + size = 4096; + if (sizeof(rwlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(rwlock_t) > PAGE_SIZE) + hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); + else +#endif + hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), + GFP_KERNEL); + if (!hashinfo->ehash_locks) + return ENOMEM; + for (i = 0; i < size; i++) + rwlock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = size - 1; + return 0; +} + +static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) +{ + if (hashinfo->ehash_locks) { +#ifdef CONFIG_NUMA + unsigned int size = (hashinfo->ehash_locks_mask + 1) * + sizeof(rwlock_t); + if (size > PAGE_SIZE) + vfree(hashinfo->ehash_locks); + else +#else + kfree(hashinfo->ehash_locks); +#endif + hashinfo->ehash_locks = NULL; + } +} + extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, @@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo, sk->sk_hash = inet_sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; - lock = &head->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock(lock); } __sk_add_node(sk, list); @@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) inet_listen_wlock(hashinfo); lock = &hashinfo->lhash_lock; } else { - lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock_bh(lock); } @@ -354,9 +412,10 @@ static inline struct sock * */ unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); prefetch(head->chain.first); - read_lock(&head->lock); + read_lock(lock); sk_for_each(sk, node, &head->chain) { if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ @@ -369,7 +428,7 @@ static inline struct sock * } sk = NULL; out: - read_unlock(&head->lock); + read_unlock(lock); return sk; hit: sock_hold(sk); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d8497392803..7a3bea9c28c 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1072,11 +1072,13 @@ static int __init dccp_init(void) } for (i = 0; i < dccp_hashinfo.ehash_size; i++) { - rwlock_init(&dccp_hashinfo.ehash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain); } + if (inet_ehash_locks_alloc(&dccp_hashinfo)) + goto out_free_dccp_ehash; + bhash_order = ehash_order; do { @@ -1091,7 +1093,7 @@ static int __init dccp_init(void) if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_ehash; + goto out_free_dccp_locks; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { @@ -1121,6 +1123,8 @@ out_free_dccp_mib: out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); dccp_hashinfo.bhash = NULL; +out_free_dccp_locks: + inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); dccp_hashinfo.ehash = NULL; @@ -1139,6 +1143,7 @@ static void __exit dccp_fini(void) free_pages((unsigned long)dccp_hashinfo.ehash, get_order(dccp_hashinfo.ehash_size * sizeof(struct inet_ehash_bucket))); + inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index dc429b6b0ba..b0170732b5e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -747,13 +747,14 @@ skip_listen_ht: for (i = s_i; i < hashinfo->ehash_size; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + rwlock_t *lock = inet_ehash_lockp(hashinfo, i); struct sock *sk; struct hlist_node *node; if (i > s_i) s_num = 0; - read_lock_bh(&head->lock); + read_lock_bh(lock); num = 0; sk_for_each(sk, node, &head->chain) { struct inet_sock *inet = inet_sk(sk); @@ -769,7 +770,7 @@ skip_listen_ht: r->id.idiag_dport) goto next_normal; if (inet_csk_diag_dump(sk, skb, cb) < 0) { - read_unlock_bh(&head->lock); + read_unlock_bh(lock); goto done; } next_normal: @@ -791,14 +792,14 @@ next_normal: r->id.idiag_dport) goto next_dying; if (inet_twsk_diag_dump(tw, skb, cb) < 0) { - read_unlock_bh(&head->lock); + read_unlock_bh(lock); goto done; } next_dying: ++num; } } - read_unlock_bh(&head->lock); + read_unlock_bh(lock); } done: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 16eecc7046a..67704da04fc 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; prefetch(head->chain.first); - write_lock(&head->lock); + write_lock(lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &head->twchain) { @@ -239,7 +240,7 @@ unique: BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); + write_unlock(lock); if (twp) { *twp = tw; @@ -255,7 +256,7 @@ unique: return 0; not_unique: - write_unlock(&head->lock); + write_unlock(lock); return -EADDRNOTAVAIL; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 4e189e28f30..a60b99e0ebd 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; /* Unlink from established hashes. */ - struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); - write_lock(&ehead->lock); + write_lock(lock); if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); + write_unlock(lock); return; } __hlist_del(&tw->tw_node); sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); + write_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; @@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in @@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); - write_lock(&ehead->lock); + write_lock(lock); /* Step 2: Remove SK from established hash. */ if (__sk_del_node_init(sk)) @@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, inet_twsk_add_node(tw, &ehead->twchain); atomic_inc(&tw->tw_refcnt); - write_unlock(&ehead->lock); + write_unlock(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c64072bb504..8e65182f7af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2456,11 +2456,11 @@ void __init tcp_init(void) thash_entries ? 0 : 512 * 1024); tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; for (i = 0; i < tcp_hashinfo.ehash_size; i++) { - rwlock_init(&tcp_hashinfo.ehash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); } - + if (inet_ehash_locks_alloc(&tcp_hashinfo)) + panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e9127cdced2..e566f3c6767 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq) struct sock *sk; struct hlist_node *node; struct inet_timewait_sock *tw; + rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); - read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_lock_bh(lock); sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; @@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq) rc = tw; goto out; } - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2094,11 +2095,11 @@ get_tw: cur = tw; goto out; } - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); st->state = TCP_SEQ_STATE_ESTABLISHED; if (++st->bucket < tcp_hashinfo.ehash_size) { - read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; @@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); + read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); break; } } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d6f1026f194..adc73adadfa 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo, } else { unsigned int hash; sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (hashinfo->ehash_size - 1); - list = &hashinfo->ehash[hash].chain; - lock = &hashinfo->ehash[hash].lock; + list = &inet_ehash_bucket(hashinfo, hash)->chain; + lock = inet_ehash_lockp(hashinfo, hash); write_lock(lock); } @@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo, */ unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); prefetch(head->chain.first); - read_lock(&head->lock); + read_lock(lock); sk_for_each(sk, node, &head->chain) { /* For IPV6 do the cheaper port and family tests first. */ if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif)) @@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo, goto hit; } } - read_unlock(&head->lock); + read_unlock(lock); return NULL; hit: sock_hold(sk); - read_unlock(&head->lock); + read_unlock(lock); return sk; } EXPORT_SYMBOL(__inet6_lookup_established); @@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const unsigned int hash = inet6_ehashfn(daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; prefetch(head->chain.first); - write_lock(&head->lock); + write_lock(lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &head->twchain) { @@ -216,7 +217,7 @@ unique: __sk_add_node(sk, &head->chain); sk->sk_hash = hash; sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); + write_unlock(lock); if (twp != NULL) { *twp = tw; @@ -231,7 +232,7 @@ unique: return 0; not_unique: - write_unlock(&head->lock); + write_unlock(lock); return -EADDRNOTAVAIL; } -- cgit v1.2.3-70-g09d2