Re: Multicast packet loss

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: Eric Dumazet
Date: Saturday, February 28, 2009 - 1:51 am

Kenny Chang a écrit :

Hi Kenny

I am investigating how to reduce contention (and schedule() calls) on this workload.

The following patch already gave me fewer packet drops (but it is not yet *perfect*):
10% packet loss instead of 30%, with 8 receivers on my 8-CPU machine.


David, this is preliminary work, not meant for inclusion as is;
comments are welcome.

Thank you

[PATCH] net: sk_forward_alloc becomes an atomic_t

Commit 95766fff6b9a78d11fc2d3812dd035381690b55d
(UDP: Add memory accounting) introduced a regression for high-rate UDP flows,
because of the extra lock_sock() in udp_recvmsg().

In order to reduce the need for lock_sock() in the UDP receive path, we might
need to declare sk_forward_alloc as an atomic_t.

With sk_forward_alloc updated atomically, udp_recvmsg() can avoid a
lock_sock()/release_sock() pair.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/sock.h   |   14 +++++++-------
 net/core/sock.c      |   31 +++++++++++++++++++------------
 net/core/stream.c    |    2 +-
 net/ipv4/af_inet.c   |    2 +-
 net/ipv4/inet_diag.c |    2 +-
 net/ipv4/tcp_input.c |    2 +-
 net/ipv4/udp.c       |    2 --
 net/ipv6/udp.c       |    2 --
 net/sched/em_meta.c  |    2 +-
 9 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..c4befb9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -250,7 +250,7 @@ struct sock {
 	struct sk_buff_head	sk_async_wait_queue;
 #endif
 	int			sk_wmem_queued;
-	int			sk_forward_alloc;
+	atomic_t		sk_forward_alloc;
 	gfp_t			sk_allocation;
 	int			sk_route_caps;
 	int			sk_gso_type;
@@ -823,7 +823,7 @@ static inline int sk_wmem_schedule(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return 1;
-	return size <= sk->sk_forward_alloc ||
+	return size <= atomic_read(&sk->sk_forward_alloc) ||
 		__sk_mem_schedule(sk, size, SK_MEM_SEND);
 }
 
@@ -831,7 +831,7 @@ static inline int sk_rmem_schedule(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return 1;
-	return size <= sk->sk_forward_alloc ||
+	return size <= atomic_read(&sk->sk_forward_alloc) ||
 		__sk_mem_schedule(sk, size, SK_MEM_RECV);
 }
 
@@ -839,7 +839,7 @@ static inline void sk_mem_reclaim(struct sock *sk)
 {
 	if (!sk_has_account(sk))
 		return;
-	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
+	if (atomic_read(&sk->sk_forward_alloc) >= SK_MEM_QUANTUM)
 		__sk_mem_reclaim(sk);
 }
 
@@ -847,7 +847,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk)
 {
 	if (!sk_has_account(sk))
 		return;
-	if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
+	if (atomic_read(&sk->sk_forward_alloc) > SK_MEM_QUANTUM)
 		__sk_mem_reclaim(sk);
 }
 
@@ -855,14 +855,14 @@ static inline void sk_mem_charge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return;
-	sk->sk_forward_alloc -= size;
+	atomic_sub(size, &sk->sk_forward_alloc);
 }
 
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return;
-	sk->sk_forward_alloc += size;
+	atomic_add(size, &sk->sk_forward_alloc);
 }
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
diff --git a/net/core/sock.c b/net/core/sock.c
index 0620046..8489105 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1081,7 +1081,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 
 		newsk->sk_dst_cache	= NULL;
 		newsk->sk_wmem_queued	= 0;
-		newsk->sk_forward_alloc = 0;
+		atomic_set(&newsk->sk_forward_alloc, 0);
 		newsk->sk_send_head	= NULL;
 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
 
@@ -1479,7 +1479,7 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 	int amt = sk_mem_pages(size);
 	int allocated;
 
-	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+	atomic_add(amt * SK_MEM_QUANTUM, &sk->sk_forward_alloc);
 	allocated = atomic_add_return(amt, prot->memory_allocated);
 
 	/* Under limit. */
@@ -1520,7 +1520,7 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 		if (prot->sysctl_mem[2] > alloc *
 		    sk_mem_pages(sk->sk_wmem_queued +
 				 atomic_read(&sk->sk_rmem_alloc) +
-				 sk->sk_forward_alloc))
+				 atomic_read(&sk->sk_forward_alloc)))
 			return 1;
 	}
 
@@ -1537,7 +1537,7 @@ suppress_allocation:
 	}
 
 	/* Alas. Undo changes. */
-	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+	atomic_sub(amt * SK_MEM_QUANTUM, &sk->sk_forward_alloc);
 	atomic_sub(amt, prot->memory_allocated);
 	return 0;
 }
@@ -1551,14 +1551,21 @@ EXPORT_SYMBOL(__sk_mem_schedule);
 void __sk_mem_reclaim(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
-
-	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
-		   prot->memory_allocated);
-	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
-
-	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
-		*prot->memory_pressure = 0;
+	int val = atomic_read(&sk->sk_forward_alloc);
+
+begin:
+	val = atomic_read(&sk->sk_forward_alloc);
+	if (val >= SK_MEM_QUANTUM) {
+		if (atomic_cmpxchg(&sk->sk_forward_alloc, val,
+				   val & (SK_MEM_QUANTUM - 1)) != val)
+			goto begin;
+		atomic_sub(val >> SK_MEM_QUANTUM_SHIFT,
+			   prot->memory_allocated);
+
+		if (prot->memory_pressure && *prot->memory_pressure &&
+		    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+			*prot->memory_pressure = 0;
+	}
 }
 
 EXPORT_SYMBOL(__sk_mem_reclaim);
diff --git a/net/core/stream.c b/net/core/stream.c
index 8727cea..4d04d28 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -198,7 +198,7 @@ void sk_stream_kill_queues(struct sock *sk)
 	sk_mem_reclaim(sk);
 
 	WARN_ON(sk->sk_wmem_queued);
-	WARN_ON(sk->sk_forward_alloc);
+	WARN_ON(atomic_read(&sk->sk_forward_alloc));
 
 	/* It is _impossible_ for the backlog to contain anything
 	 * when we get here.  All user references to this socket
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 627be4d..7a1475c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -152,7 +152,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 	WARN_ON(sk->sk_wmem_queued);
-	WARN_ON(sk->sk_forward_alloc);
+	WARN_ON(atomic_read(&sk->sk_forward_alloc));
 
 	kfree(inet->opt);
 	dst_release(sk->sk_dst_cache);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 588a779..903ad66 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -158,7 +158,7 @@ static int inet_csk_diag_fill(struct sock *sk,
 	if (minfo) {
 		minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
 		minfo->idiag_wmem = sk->sk_wmem_queued;
-		minfo->idiag_fmem = sk->sk_forward_alloc;
+		minfo->idiag_fmem = atomic_read(&sk->sk_forward_alloc);
 		minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6961d7..5e08f37 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5258,7 +5258,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
 				tcp_rcv_rtt_measure_ts(sk, skb);
 
-				if ((int)skb->truesize > sk->sk_forward_alloc)
+				if ((int)skb->truesize > atomic_read(&sk->sk_forward_alloc))
 					goto step5;
 
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4bd178a..dcc246a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -955,9 +955,7 @@ try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
 	skb_free_datagram(sk, skb);
-	release_sock(sk);
 out:
 	return err;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 84b1a29..582b80a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -257,9 +257,7 @@ try_again:
 		err = ulen;
 
 out_free:
-	lock_sock(sk);
 	skb_free_datagram(sk, skb);
-	release_sock(sk);
 out:
 	return err;
 
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 72cf86e..94d90b6 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -383,7 +383,7 @@ META_COLLECTOR(int_sk_wmem_queued)
 META_COLLECTOR(int_sk_fwd_alloc)
 {
 	SKIP_NONLOCAL(skb);
-	dst->value = skb->sk->sk_forward_alloc;
+	dst->value = atomic_read(&skb->sk->sk_forward_alloc);
 }
 
 META_COLLECTOR(int_sk_sndbuf)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
Re: Multicast packet loss, Wes Chow, (Mon Feb 2, 12:51 pm)
Re: Multicast packet loss, Eric Dumazet, (Mon Feb 2, 1:29 pm)
Re: Multicast packet loss, Wes Chow, (Mon Feb 2, 2:09 pm)
Re: Multicast packet loss, Eric Dumazet, (Mon Feb 2, 2:31 pm)
Re: Multicast packet loss, Kenny Chang, (Tue Feb 3, 10:34 am)
Re: Multicast packet loss, Neil Horman, (Tue Feb 3, 6:21 pm)
Re: Multicast packet loss, Kenny Chang, (Thu Feb 26, 10:15 am)
Re: Multicast packet loss, Eric Dumazet, (Sat Feb 28, 1:51 am)
Re: Multicast packet loss, Eric Dumazet, (Sun Mar 1, 10:03 am)
Re: Multicast packet loss, David Miller, (Wed Mar 4, 1:16 am)
Re: Multicast packet loss, Eric Dumazet, (Wed Mar 4, 1:36 am)
Re: Multicast packet loss, Eric Dumazet, (Sat Mar 7, 12:46 am)
Re: Multicast packet loss, Eric Dumazet, (Sun Mar 8, 9:46 am)
Re: Multicast packet loss, David Miller, (Sun Mar 8, 7:49 pm)
Re: Multicast packet loss, Eric Dumazet, (Sun Mar 8, 11:36 pm)
Re: Multicast packet loss, Brian Bloniarz, (Mon Mar 9, 3:56 pm)
Re: Multicast packet loss, Eric Dumazet, (Mon Mar 9, 10:28 pm)
Re: Multicast packet loss, Brian Bloniarz, (Tue Mar 10, 4:22 pm)
Re: Multicast packet loss, Eric Dumazet, (Tue Mar 10, 8:00 pm)
Re: Multicast packet loss, Brian Bloniarz, (Thu Mar 12, 8:47 am)
Re: Multicast packet loss, Eric Dumazet, (Thu Mar 12, 9:34 am)
Re: Multicast packet loss, David Miller, (Fri Mar 13, 2:51 pm)
Re: Multicast packet loss, Eric Dumazet, (Fri Mar 13, 3:30 pm)
Re: Multicast packet loss, David Miller, (Fri Mar 13, 3:38 pm)
Re: Multicast packet loss, Eric Dumazet, (Fri Mar 13, 3:45 pm)
[PATCH] net: reorder fields of struct socket, Eric Dumazet, (Sat Mar 14, 2:03 am)
Re: [PATCH] net: reorder fields of struct socket, David Miller, (Sun Mar 15, 7:59 pm)
Re: Multicast packet loss, Eric Dumazet, (Mon Mar 16, 3:22 pm)
Re: Multicast packet loss, Peter Zijlstra, (Tue Mar 17, 3:11 am)
Re: Multicast packet loss, Eric Dumazet, (Tue Mar 17, 4:08 am)
Re: Multicast packet loss, Peter Zijlstra, (Tue Mar 17, 4:57 am)
Re: Multicast packet loss, Brian Bloniarz, (Tue Mar 17, 8:00 am)
Re: Multicast packet loss, Eric Dumazet, (Tue Mar 17, 8:16 am)
Re: Multicast packet loss, David Stevens, (Tue Mar 17, 12:39 pm)
Re: Multicast packet loss, Eric Dumazet, (Tue Mar 17, 2:19 pm)
Re: Multicast packet loss, Brian Bloniarz, (Fri Apr 3, 12:28 pm)
Re: Multicast packet loss, Eric Dumazet, (Sun Apr 5, 6:49 am)
Re: Multicast packet loss, Brian Bloniarz, (Mon Apr 6, 2:53 pm)
Re: Multicast packet loss, Brian Bloniarz, (Mon Apr 6, 3:12 pm)
Re: Multicast packet loss, Brian Bloniarz, (Tue Apr 7, 1:08 pm)
Re: Multicast packet loss, Eric Dumazet, (Wed Apr 8, 1:12 am)