summaryrefslogtreecommitdiffstats
path: root/include/net
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2008-10-29 01:41:45 -0700
committerDavid S. Miller <davem@davemloft.net>2008-10-29 01:41:45 -0700
commit645ca708f936b2fbeb79e52d7823e3eb2c0905f8 (patch)
treeb384696994ee3cb04759a7bfffc29a48e4bf40f6 /include/net
parentb189db5d299c6824780af5590564ff608adb3dea (diff)
udp: introduce struct udp_table and multiple spinlocks
UDP sockets are hashed in a 128-slot hash table. This hash table is protected by *one* rwlock. This rwlock is read-locked each time an incoming UDP message is handled. This rwlock is write-locked each time a socket must be inserted in the hash table (bind time), or deleted from this table (close time). This is not scalable on SMP machines: 1) Even in read mode, lock() and unlock() are atomic operations and must dirty a contended cache line, shared by all cpus. 2) A writer might be starved if many readers are 'in flight'. This can happen on a machine with some NIC receiving many UDP messages. A user process can be delayed a long time at socket creation/dismantle time. This patch prepares RCU migration, by introducing 'struct udp_table' and 'struct udp_hslot', and using one spinlock per chain, to reduce contention on the central rwlock. Introducing one spinlock per chain reduces latencies for port randomization on heavily loaded UDP servers. This also speeds up bindings to specific ports. udp_lib_unhash() was uninlined, as it was becoming too big. Some cleanups were done to ease review of the following patch (RCUification of UDP Unicast lookups). Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
-rw-r--r--include/net/sock.h2
-rw-r--r--include/net/udp.h25
-rw-r--r--include/net/udplite.h2
3 files changed, 14 insertions, 15 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index d6b750a2507..d200dfbe1ef 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -599,7 +599,7 @@ struct proto {
union {
struct inet_hashinfo *hashinfo;
- struct hlist_head *udp_hash;
+ struct udp_table *udp_table;
struct raw_hashinfo *raw_hash;
} h;
diff --git a/include/net/udp.h b/include/net/udp.h
index 1e205095ea6..df2bfe54537 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,8 +50,15 @@ struct udp_skb_cb {
};
#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
-extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-extern rwlock_t udp_hash_lock;
+struct udp_hslot {
+ struct hlist_head head;
+ spinlock_t lock;
+} __attribute__((aligned(2 * sizeof(long))));
+struct udp_table {
+ struct udp_hslot hash[UDP_HTABLE_SIZE];
+};
+extern struct udp_table udp_table;
+extern void udp_table_init(struct udp_table *);
/* Note: this must match 'valbool' in sock_setsockopt */
@@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk)
BUG();
}
-static inline void udp_lib_unhash(struct sock *sk)
-{
- write_lock_bh(&udp_hash_lock);
- if (sk_del_node_init(sk)) {
- inet_sk(sk)->num = 0;
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- }
- write_unlock_bh(&udp_hash_lock);
-}
+extern void udp_lib_unhash(struct sock *sk);
static inline void udp_lib_close(struct sock *sk, long timeout)
{
@@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct udp_seq_afinfo {
char *name;
sa_family_t family;
- struct hlist_head *hashtable;
+ struct udp_table *udp_table;
struct file_operations seq_fops;
struct seq_operations seq_ops;
};
@@ -196,7 +195,7 @@ struct udp_iter_state {
struct seq_net_private p;
sa_family_t family;
int bucket;
- struct hlist_head *hashtable;
+ struct udp_table *udp_table;
};
#ifdef CONFIG_PROC_FS
diff --git a/include/net/udplite.h b/include/net/udplite.h
index b76b2e377af..afdffe607b2 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -11,7 +11,7 @@
#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */
extern struct proto udplite_prot;
-extern struct hlist_head udplite_hash[UDP_HTABLE_SIZE];
+extern struct udp_table udplite_table;
/*
* Checksum computation is all in software, hence simpler getfrag.