Third iteration of patch to allow disabling of TCP SACK, DSACK, time stamp and window scale TCP options on a per route basis, now with 100% less remote DoS opportunities (thank you Ilpo for spotting it ;-) You usually want to disable SACK, DSACK, time stamp or window scale if you've got a piece of broken networking equipment somewhere as a stop gap until you can bring a big enough hammer to deal with the broken network equipment. It doesn't make sense to "punish" all the connections going through the machine to destinations not related to the broken equipment. This is doubly true when you're dealing with network containers used to isolate several virtual domains. Per route options are implemented in free bits in the features route entry property, which in some cases were reserved by name for these options, so this does not inflate any structure. Global sysctl based kill switches for these options are still preserved, as some people seem to want them, so the default behaviour is on, unless switched off either globally or on a per route basis. Tested on x86 using Qemu/KVM. Working but crude matching patch to iproute2 sent earlier to the list. Patchset based on original work by Ori Finkelman and Yony Amit from ComSleep Ltd. Gilad Ben-Yossef (7): Only parse time stamp TCP option in time wait sock Allow tcp_parse_options to consult dst entry Infrastructure for querying route entry features Add the no SACK route option feature Allow disabling TCP timestamp options per route Allow to turn off TCP window scale opt per route Allow disabling of DSACK TCP option per route include/linux/rtnetlink.h | 6 ++++-- include/net/dst.h | 8 +++++++- include/net/tcp.h | 3 ++- net/ipv4/syncookies.c | 27 ++++++++++++++------------- net/ipv4/tcp_input.c | 26 ++++++++++++++++++-------- net/ipv4/tcp_ipv4.c | 21 ++++++++++++--------- net/ipv4/tcp_minisocks.c | 8 +++++--- net/ipv4/tcp_output.c | 18 ...
Add and use no window scale bit in the features field.
Note that this is not the same as setting a window scale of 0
as would happen with window limit on route.
Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>
---
include/linux/rtnetlink.h | 1 +
net/ipv4/tcp_input.c | 3 ++-
net/ipv4/tcp_output.c | 6 ++++--
3 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2ab8c75..6784b34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -380,6 +380,7 @@ enum
#define RTAX_FEATURE_NO_SACK 0x00000002
#define RTAX_FEATURE_NO_TSTAMP 0x00000004
#define RTAX_FEATURE_ALLFRAG 0x00000008
+#define RTAX_FEATURE_NO_WSCALE 0x00000010
struct rta_session
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d2f9742..4f5e914 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3739,7 +3739,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn &&
- !estab && sysctl_tcp_window_scaling) {
+ !estab && sysctl_tcp_window_scaling &&
+ !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > 14) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f30c18..ff60a21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -496,7 +496,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
- if (likely(sysctl_tcp_window_scaling)) {
+ if (likely(sysctl_tcp_window_scaling &&
+ !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
size += ...Add and use no DSACK bit in the features field.
Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>
---
include/linux/rtnetlink.h | 1 +
net/ipv4/tcp_input.c | 8 ++++++--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6784b34..e78b60c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -381,6 +381,7 @@ enum
#define RTAX_FEATURE_NO_TSTAMP 0x00000004
#define RTAX_FEATURE_ALLFRAG 0x00000008
#define RTAX_FEATURE_NO_WSCALE 0x00000010
+#define RTAX_FEATURE_NO_DSACK 0x00000020
struct rta_session
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f5e914..4262da5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4080,8 +4080,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
- if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+ !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
int mib_idx;
if (before(seq, tp->rcv_nxt))
@@ -4110,13 +4112,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
- if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+ !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
-- ...Add an accessor to the existing dst_entry features field and
refactor the only supported feature (allfrag) to use it.
Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>
---
include/net/dst.h | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/include/net/dst.h b/include/net/dst.h
index 5a900dd..b562be3 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -111,6 +111,12 @@ dst_metric(const struct dst_entry *dst, int metric)
return dst->metrics[metric-1];
}
+static inline u32
+dst_feature(const struct dst_entry *dst, u32 feature)
+{
+ return dst_metric(dst, RTAX_FEATURES) & feature;
+}
+
static inline u32 dst_mtu(const struct dst_entry *dst)
{
u32 mtu = dst_metric(dst, RTAX_MTU);
@@ -136,7 +142,7 @@ static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
static inline u32
dst_allfrag(const struct dst_entry *dst)
{
- int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+ int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG);
/* Yes, _exactly_. This is paranoia. */
barrier();
return ret;
--
1.5.6.3
--
Implement querying and acting upon the no timestamp bit in the feature
field.
Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>
---
include/linux/rtnetlink.h | 2 +-
net/ipv4/tcp_input.c | 3 ++-
net/ipv4/tcp_output.c | 8 ++++++--
3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c802a6..2ab8c75 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -378,7 +378,7 @@ enum
#define RTAX_FEATURE_ECN 0x00000001
#define RTAX_FEATURE_NO_SACK 0x00000002
-#define RTAX_FEATURE_TIMESTAMP 0x00000004
+#define RTAX_FEATURE_NO_TSTAMP 0x00000004
#define RTAX_FEATURE_ALLFRAG 0x00000008
struct rta_session
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b14f780..d2f9742 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3755,7 +3755,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) &&
((estab && opt_rx->tstamp_ok) ||
- (!estab && sysctl_tcp_timestamps))) {
+ (!estab && sysctl_tcp_timestamps &&
+ !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 64db8dd..8f30c18 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -488,7 +488,9 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
size += TCPOLEN_MSS_ALIGNED;
- if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+ if (likely(sysctl_tcp_timestamps &&
+ !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
+ *md5 == NULL)) {
opts->options |= OPTION_TS;
opts->tsval = ...Since we only use tcp_parse_options here to check for the existence
of TCP timestamp option in the header, it is better to call with
the "established" flag on.
Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>
---
net/ipv4/tcp_minisocks.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9..c49a550 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -100,9 +100,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_options_received tmp_opt;
int paws_reject = 0;
- tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, 0);
+ tcp_parse_options(skb, &tmp_opt, 1);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
--
1.5.6.3
--
Please explain how this patch is required for the other patches? And more importantly, why it is better to call with established on? And most importantly, what end cases you considered, and how this interacts with the proposed rfc1323bis changes, especially on reset? --
Hi William, Gladly (and suggestions to do it differently are welcome) : For the purpose of the patch tcp_parse_options was changed to consult dst_entry options when parsing non established packets. This means that for any place that we call tcp_parse_options with the established parameter set to false, we need to supply it with a dst_entry. In all other locations in kernel code when tcp_parse_options is called such a dst_entry is easily available already. The time wait mini socket exists so that we would not waste resource keeping around the full socket state of a "real socket". As such, it does not cache the dst_entry. Adding it to the TIME_WAIT mini sockets just for this purpose defeats the purpose of having a mini socket in the first place. One other possible way to go about it is to re-compute the dst_entry at this location, but this seemed an expensive operation to perform for what should be a light weight operation. I asked myself if there might be another way? So I took a good look at the code and discovered that there is no need Sure. This is kind of long written down, although it's really simple. I will try to describe it as best I can. Take a look at what tcp_parse_options() does as a function - It has only one output: changing the fields of the tcp_options_received struct which it gets a pointer to as a parameter. It also has a single side effect: it updates the SKB TCP control block sacked field, if a SACK option is detected in the packet header. Its behavior is dictated by the established parameter. If false, it will try to parse all supported TCP options, if found in the packet header. If true it will only try to parse the time stamp and SACK options. Now take a look what happens at tcp_timewait_state_process() when we call tcp_parse_options() - We allocate (on stack) a temporary tcp_options_received struct, and if our TIME_WAIT mini socket had recent timestamp data (tcptw->tw_ts_recent_stamp), we call ...
Implement querying and acting upon the no sack bit in the features
field.
Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>
---
include/linux/rtnetlink.h | 2 +-
net/ipv4/tcp_input.c | 3 ++-
net/ipv4/tcp_output.c | 4 +++-
3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index adf2068..9c802a6 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -377,7 +377,7 @@ enum
#define RTAX_MAX (__RTAX_MAX - 1)
#define RTAX_FEATURE_ECN 0x00000001
-#define RTAX_FEATURE_SACK 0x00000002
+#define RTAX_FEATURE_NO_SACK 0x00000002
#define RTAX_FEATURE_TIMESTAMP 0x00000004
#define RTAX_FEATURE_ALLFRAG 0x00000008
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d502f49..b14f780 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3763,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
- !estab && sysctl_tcp_sack) {
+ !estab && sysctl_tcp_sack &&
+ !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
opt_rx->sack_ok = 1;
tcp_sack_reset(opt_rx);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a..64db8dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -464,6 +464,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_md5sig_key **md5) {
struct tcp_sock *tp = tcp_sk(sk);
unsigned size = 0;
+ struct dst_entry *dst = __sk_dst_get(sk);
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -498,7 +499,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->options |= OPTION_WSCALE;
size += TCPOLEN_WSCALE_ALIGNED;
}
- if (likely(sysctl_tcp_sack)) ...Please explain how this code turns SACK on when it is off globally? As both Eric and I asked? --
It doesn't. Please see my discussion with Eric for the why. In short, doing so introduces a very subtle change to what the existing interface does today, which will break backwards compatibility by changing the meaning of writing zero to the relevant sysctl. I don't want to be hunted down by angry sys admins :-) Thanks, Gilad -- Gilad Ben-Yossef Chief Coffee Drinker & CTO Codefidence Ltd. Web: http://codefidence.com Cell: +972-52-8260388 Skype: gilad_codefidence Tel: +972-8-9316883 ext. 201 Fax: +972-8-9316884 Email: gilad@codefidence.com Check out our Open Source technology and training blog - http://tuxology.net "The biggest risk you can take it is to take no risk." -- Mark Zuckerberg and probably others --
From: Gilad Ben-Yossef <gby@watson.codefidence.com> We need tcp_parse_options to be aware of dst_entry to take into account per dst_entry TCP options settings Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com> Sigend-off-by: Ori Finkelman <ori@comsleep.com> Sigend-off-by: Yony Amit <yony@comsleep.com> --- include/net/tcp.h | 3 ++- net/ipv4/syncookies.c | 27 ++++++++++++++------------- net/ipv4/tcp_input.c | 9 ++++++--- net/ipv4/tcp_ipv4.c | 21 ++++++++++++--------- net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/syncookies.c | 28 +++++++++++++++------------- net/ipv6/tcp_ipv6.c | 3 ++- 7 files changed, 56 insertions(+), 42 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 03a49c7..740d09b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -409,7 +409,8 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, extern void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - int estab); + int estab, + struct dst_entry *dst); extern u8 *tcp_parse_md5sig_option(struct tcphdr *th); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a6e0e07..4990dd4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -276,13 +276,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); - /* check for timestamp cookie support */ - memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0); - - if (tcp_opt.saw_tstamp) - cookie_check_timestamp(&tcp_opt); - ret = NULL; req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ if (!req) @@ -298,12 +291,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; ireq->ecn_ok = 0; - ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->rcv_wscale = ...
