After some more audit and coffee, I finally found one subtle bug in our
connect() code, that periodically triggers but never got tracked.
Here is a patch cooked on top of current linux-2.6 git tree, it should probably
apply on 2.6.31.6 as well...
Thanks
[PATCH] tcp: Fix a connect() race with timewait sockets
When we find a timewait connection in __inet_hash_connect() and reuse
it for a new connection request, we have a race window, releasing bind
list lock and reacquiring it in __inet_twsk_kill() to remove timewait
socket from list.
Another thread might find the timewait socket we already chose, leading to
list corruption and crashes.
Fix is to remove timewait socket from bind list before releasing the lock.
Reported-by: kapil dakhane <kdakhane@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/net/inet_timewait_sock.h | 4 +++
net/ipv4/inet_hashtables.c | 4 +++
net/ipv4/inet_timewait_sock.c | 37 ++++++++++++++++++++---------
3 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index f93ad90..e18e5df 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -206,6 +206,10 @@ extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw,
struct sock *sk,
struct inet_hashinfo *hashinfo);
+extern void inet_twsk_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo,
+ bool mustlock);
+
extern void inet_twsk_schedule(struct inet_timewait_sock *tw,
struct inet_timewait_death_row *twdr,
const int timeo, const int timewait_len);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 625cc5f..76d81e4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -488,6 +488,10 @@ ok:
inet_sk(sk)->sport = htons(port);
hash(sk);
}
+
+ if (tw)
+ inet_twsk_unhash(tw, hinfo, false);
+
spin_unlock(&head->lock);
if (tw) {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 13f0781..2d6d543 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,12 +14,34 @@
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+
+void inet_twsk_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo,
+ bool mustlock)
+{
+ struct inet_bind_hashbucket *bhead;
+ struct inet_bind_bucket *tb = tw->tw_tb;
+
+ if (!tb)
+ return;
+
+ /* Disassociate with bind bucket. */
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw),
+ tw->tw_num,
+ hashinfo->bhash_size)];
+ if (mustlock)
+ spin_lock(&bhead->lock);
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ if (mustlock)
+ spin_unlock(&bhead->lock);
+}
+
/* Must be called with locally disabled BHs. */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
- struct inet_bind_hashbucket *bhead;
- struct inet_bind_bucket *tb;
/* Unlink from established hashes. */
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
@@ -32,15 +54,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
sk_nulls_node_init(&tw->tw_node);
spin_unlock(lock);
- /* Disassociate with bind bucket. */
- bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
- hashinfo->bhash_size)];
- spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
- spin_unlock(&bhead->lock);
+ inet_twsk_unhash(tw, hashinfo, true);
+
#ifdef SOCK_REFCNT_DEBUG
if (atomic_read(&tw->tw_refcnt) != 1) {
printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html