[PATCH v3 3/7] Add dst_feature to query route entry features

Previous thread: Re: VLAN and ARP failure on tg3 drivers by Gertjan Hofman on Sunday, October 25, 2009 - 9:30 pm. (10 messages)

Next thread: [PATCH net-next-2.6] net: sysfs: ethtool_ops can be NULL by Eric Dumazet on Monday, October 26, 2009 - 4:23 am. (3 messages)
From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Third iteration of patch to allow disablng of TCP SACK, DSCAK, 
time stamp and window scale TCP options on a per route basis, now
with 100% less remote DoS opportunities (thank you Ilpo for 
spotting it ;-)

You usualy want to disable SACK, DSACK, time stamp or window
scale if you've got a piece of broken networking equipment somewhere 
as a stop gap until you can bring a big enough hammer to deal with
the broken network equipment. It doesn't make sense to "punish" the
entire connections going through the machine to destinations not 
related to the broken equipment.

This is doubly true when you're dealing with network containers
used to isolate several virtual domains.

Per route options implemented in free bits in the features route
entry property, which in some cases were reserved by name for these
options, so this does not inflate any structure.

Global sysctl based kill switches for these options are still
preserved, as some people seems to want them, so behaviour
is default to on, unless switched off either globaly or on
per route basis.

Tested on x86 using Qemu/KVM.  

Working but crude matching patch to iproute2 sent earlier to the list.

Patchset based on original work by Ori Finkelman and Yony Amit 
from ComSleep Ltd.

Gilad Ben-Yossef (7):
  Only parse time stamp TCP option in time wait sock
  Allow tcp_parse_options to consult dst entry
  Infrastructure for querying route entry features
  Add the no SACK route option feature
  Allow disabling TCP timestamp options per route
  Allow to turn off TCP window scale opt per route
  Allow disabling of DSACK TCP option per route

 include/linux/rtnetlink.h |    6 ++++--
 include/net/dst.h         |    8 +++++++-
 include/net/tcp.h         |    3 ++-
 net/ipv4/syncookies.c     |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c      |   26 ++++++++++++++++++--------
 net/ipv4/tcp_ipv4.c       |   21 ++++++++++++---------
 net/ipv4/tcp_minisocks.c  |    8 +++++---
 net/ipv4/tcp_output.c     |   18 ...
From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Add and use no window scale bit in the features field.

Note that this is not the same as setting a window scale of 0 
as would happen with window limit on route.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2ab8c75..6784b34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -380,6 +380,7 @@ enum
 #define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_NO_WSCALE	0x00000010
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d2f9742..4f5e914 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3739,7 +3739,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_WINDOW:
 				if (opsize == TCPOLEN_WINDOW && th->syn &&
-				    !estab && sysctl_tcp_window_scaling) {
+				    !estab && sysctl_tcp_window_scaling &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
 					__u8 snd_wscale = *(__u8 *)ptr;
 					opt_rx->wscale_ok = 1;
 					if (snd_wscale > 14) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f30c18..ff60a21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -496,7 +496,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
-	if (likely(sysctl_tcp_window_scaling)) {
+	if (likely(sysctl_tcp_window_scaling &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
 		opts->ws = tp->rx_opt.rcv_wscale;
 		opts->options |= OPTION_WSCALE;
 		size += ...
From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Add and use no DSCAK bit in the features field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6784b34..e78b60c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -381,6 +381,7 @@ enum
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 #define RTAX_FEATURE_NO_WSCALE	0x00000010
+#define RTAX_FEATURE_NO_DSACK	0x00000020
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f5e914..4262da5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4080,8 +4080,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+	if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+	    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 		int mib_idx;
 
 		if (before(seq, tp->rcv_nxt))
@@ -4110,13 +4112,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
 static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
 		tcp_enter_quickack_mode(sk);
 
-		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+		if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+		    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
-- ...
From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Adding an accessor to existing  dst_entry feautres field and
refactor the only supported feature (allfrag) to use it.


Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/dst.h |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 5a900dd..b562be3 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -111,6 +111,12 @@ dst_metric(const struct dst_entry *dst, int metric)
 	return dst->metrics[metric-1];
 }
 
+static inline u32
+dst_feature(const struct dst_entry *dst, u32 feature)
+{
+	return dst_metric(dst, RTAX_FEATURES) & feature;
+}
+
 static inline u32 dst_mtu(const struct dst_entry *dst)
 {
 	u32 mtu = dst_metric(dst, RTAX_MTU);
@@ -136,7 +142,7 @@ static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
-	int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+	int ret = dst_feature(dst,  RTAX_FEATURE_ALLFRAG);
 	/* Yes, _exactly_. This is paranoia. */
 	barrier();
 	return ret;
-- 
1.5.6.3

--

From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Implement querying and acting upon the no timestamp bit in the feature 
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c802a6..2ab8c75 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -378,7 +378,7 @@ enum
 
 #define RTAX_FEATURE_ECN	0x00000001
 #define RTAX_FEATURE_NO_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
+#define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
 struct rta_session
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b14f780..d2f9742 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3755,7 +3755,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			case TCPOPT_TIMESTAMP:
 				if ((opsize == TCPOLEN_TIMESTAMP) &&
 				    ((estab && opt_rx->tstamp_ok) ||
-				     (!estab && sysctl_tcp_timestamps))) {
+				     (!estab && sysctl_tcp_timestamps &&
+				      !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
 					opt_rx->saw_tstamp = 1;
 					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
 					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 64db8dd..8f30c18 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -488,7 +488,9 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	size += TCPOLEN_MSS_ALIGNED;
 
-	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+	if (likely(sysctl_tcp_timestamps &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
+		   *md5 == NULL)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = ...
From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Since we only use tcp_parse_options here to check for the exietence
of TCP timestamp option in the header, it is better to call with
the "established" flag on.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>

---
 net/ipv4/tcp_minisocks.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9..c49a550 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -100,9 +100,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 
-	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 1);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
-- 
1.5.6.3

--

From: William Allen Simpson
Date: Monday, October 26, 2009 - 9:56 am

Please explain how this patch is required for the other patches?

And more importantly, why it is better to call with established on?

And most importantly, what end cases you considered, and how this
interacts with the proposed rfc1323bis changes, especially on reset?
--

From: Gilad Ben-Yossef
Date: Wednesday, October 28, 2009 - 3:14 am

Hi William,


Gladly (and suggestions to do it differently are welcome) :

For the purpose of the patch tcp_parse_options was changed to consult 
dst_entry options when parsing non established packets.

This means that for any place that we call tcp_parse_options with the 
established parameter set to false, we need to supply it with a dst_entry.

In all other locations in kernel code when tcp_parse_options is called 
such a dst_entry is easily available already.

The time wait mini socket exists so that we would not waste resource 
keeping around the full socket state of a "real socket". As such, it 
does not cache the dst_entry. Adding it to the TIME_WAIT mini sockets 
jsut for this purpose defeats the purpose of having a mini socket in the 
first place.

One other possible way to go about it is to re-compute the dst_entry at 
this location, but this seemed an expensive operation to perform for 
what should be a light weight operation. I asked myself if there might 
be another way?

So I took a good look at the code and discovered that there is no need 
Sure. This is kind of long written down, although it's really simple. I 
will try to describe it as best I can.

Take a look at what tcp_parse_options() does as a function -

It has only one output: changing the fields of the tcp_options_received 
struct  which it gets a pointer to as a parameter. It also has a single 
side effect: it updates the  SKB TCP control block sacked field, if a 
SACK option is detected in the packet header.

Its behavior is dictated by the established parameter. If false, it will 
try to parse all supported TCP options, if found in the packet header. 
If true it will only try to parse the time stamp and SACK options.

Now take a look what happens at tcp_timewait_state_process() when we 
call tcp_parse_options() -

We allocate (on stack) a temporary tcp_options_received struct, and if 
our TIME_WAIT mini socket had recent timestamp data 
(tcptw->tw_ts_recent_stamp), we call ...
From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

Implement querying and acting upon the no sack bit in the features
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    4 +++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index adf2068..9c802a6 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -377,7 +377,7 @@ enum
 #define RTAX_MAX (__RTAX_MAX - 1)
 
 #define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
+#define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_TIMESTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d502f49..b14f780 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3763,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_SACK_PERM:
 				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
-				    !estab && sysctl_tcp_sack) {
+				    !estab && sysctl_tcp_sack &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
 					opt_rx->sack_ok = 1;
 					tcp_sack_reset(opt_rx);
 				}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a..64db8dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -464,6 +464,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_md5sig_key **md5) {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned size = 0;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -498,7 +499,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(sysctl_tcp_sack)) ...
From: William Allen Simpson
Date: Monday, October 26, 2009 - 9:38 am

Please explain how this code turns SACK on when it is off globally?

As both Eric and I asked?
--

From: Gilad Ben-Yossef
Date: Wednesday, October 28, 2009 - 3:18 am

It doesn't. Please see my discussion with Eric for the why.

In short, doing so introduce a very subtle change to what the existing 
interface do today, which will break
backwards compatibility by changing the meaning of writing zero to the 
relevant sysctl. I don't want to be hunt down by angry sys admins :-)

Thanks,
Gilad




-- 
Gilad Ben-Yossef
Chief Coffee Drinker & CTO
Codefidence Ltd.

Web:   http://codefidence.com
Cell:  +972-52-8260388
Skype: gilad_codefidence
Tel:   +972-8-9316883 ext. 201
Fax:   +972-8-9316884
Email: gilad@codefidence.com

Check out our Open Source technology and training blog - http://tuxology.net

	"The biggest risk you can take it is to take no risk."
		-- Mark Zuckerberg and probably others

--

From: Gilad Ben-Yossef
Date: Monday, October 26, 2009 - 1:06 am

From: Gilad Ben-Yossef <gby@watson.codefidence.com>

We need tcp_parse_options to be aware of dst_entry to
take into account per dst_entry TCP options settings

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/tcp.h        |    3 ++-
 net/ipv4/syncookies.c    |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c     |    9 ++++++---
 net/ipv4/tcp_ipv4.c      |   21 ++++++++++++---------
 net/ipv4/tcp_minisocks.c |    7 +++++--
 net/ipv6/syncookies.c    |   28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c      |    3 ++-
 7 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 03a49c7..740d09b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,7 +409,8 @@ extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 
 extern void			tcp_parse_options(struct sk_buff *skb,
 						  struct tcp_options_received *opt_rx,
-						  int estab);
+						  int estab,
+						  struct dst_entry *dst);
 
 extern u8			*tcp_parse_md5sig_option(struct tcphdr *th);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e07..4990dd4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -276,13 +276,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
 	if (!req)
@@ -298,12 +291,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= ...
Previous thread: Re: VLAN and ARP failure on tg3 drivers by Gertjan Hofman on Sunday, October 25, 2009 - 9:30 pm. (10 messages)

Next thread: [PATCH net-next-2.6] net: sysfs: ethtool_ops can be NULL by Eric Dumazet on Monday, October 26, 2009 - 4:23 am. (3 messages)