diff -ruN linux-2.6.26/Documentation/networking/ip-sysctl.txt linux-2.6.26-MTCP-development/Documentation/networking/ip-sysctl.txt --- linux-2.6.26/Documentation/networking/ip-sysctl.txt 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/Documentation/networking/ip-sysctl.txt 2009-03-03 22:04:09.000000000 +0100 @@ -211,6 +211,15 @@ to the values prior timeout Default: 0 (rate halving based) +tcp_init_ssthresh - INTEGER + Number of segments for the initial SSTHRESH value + Default: 0 (=MAXINT) + +tcp_initial_bw_est - BOOLEAN + 0: Initial Hoe/Allman bandwidth estimation off + 1: Initial Hoe/Allman bandwidth estimation on + Default: 0 + tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. Default: 2hours. @@ -302,6 +311,32 @@ you should think about lowering this value, such sockets may consume significant resources. Cf. tcp_max_orphans. +tcp_pacing - BOOLEAN + 0: Send each window as a burst of segments + 1: Spread segments across the whole RTT + Default: 0 + +tcp_as_nrtt - INTEGER + Period of Adaptive-Selection, expressed in RTTs + Default: 10 + +tcp_as_alg - INTEGER + 0: Adaptive selection among reno, hybla and highspeed (bic if the bandwidth estimate b is 0) + 1: Adaptive selection among reno, hybla and htcp (bic if the bandwidth estimate b is 0) + 2: Adaptive selection among reno, hybla and bic (bic if the bandwidth estimate b is 0) + Default: 0 + +tcp_as_mode - INTEGER + If Adaptive-Selection congestion control is off, no effect. + 0: Enable TCP Static Adaptive-Selection, with one-shot selection based on BDP + 1: Enable TCP Dynamic Adaptive-Selection, with dynamic selection every nrtt based on BDP + Default: 0 + +tcp_as_debug - BOOLEAN + 0: Adaptive-Selection debug messages off + 1: Adaptive-Selection debug messages on (static and dynamic mode) + Default: 0 + tcp_reordering - INTEGER Maximal reordering of packets in a TCP stream. Default: 3 @@ -416,6 +451,16 @@ It should not be changed without advice/request of technical experts. 
+tcp_uob_dbg - INTEGER + 0: Do not send information about TCP segments to klogd + 1: Send information about TCP segments to klogd + 2: Enable "Achieved Rate" samples if supported by the kernel + +tcp_achieved_rate_delta - INTEGER + Interval between samples for "Achieved Rate" estimation + 0: collect one sample per RTT + *: specify interval between samples (in milliseconds) + tcp_window_scaling - BOOLEAN Enable window scaling as defined in RFC1323. diff -ruN linux-2.6.26/include/linux/sysctl.h linux-2.6.26-MTCP-development/include/linux/sysctl.h --- linux-2.6.26/include/linux/sysctl.h 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/include/linux/sysctl.h 2009-03-03 22:04:01.000000000 +0100 @@ -435,6 +435,28 @@ NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + +#ifdef CONFIG_TCP_MULTITCP + NET_TCP_UOB_DBG=130, +#ifdef CONFIG_TCP_PACING + NET_TCP_PACING=131, + NET_TCP_INITIAL_BW_EST=132, +#endif + NET_TCP_INIT_SSTHRESH=133, + NET_TCP_RTO_MAX=134, + +#ifdef CONFIG_TCP_CONG_AS + NET_TCP_AS_ALG=140, + NET_TCP_AS_DEBUG=141, + NET_TCP_AS_MODE=142, + NET_TCP_AS_NRTT=143, +#endif +#ifdef CONFIG_TCP_ACHIEVED_RATE + NET_TCP_ACHIEVED_RATE_DELTA=150, + NET_TCP_ACHIEVED_RATE=151, +#endif +#endif + }; enum { diff -ruN linux-2.6.26/include/linux/tcp.h linux-2.6.26-MTCP-development/include/linux/tcp.h --- linux-2.6.26/include/linux/tcp.h 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/include/linux/tcp.h 2009-03-03 22:04:01.000000000 +0100 @@ -376,6 +376,48 @@ unsigned long last_synq_overflow; + u8 may_keep_cwnd; /* while in recovery, a new ack keeps existing window in tcp_cwnd_down*/ +#ifdef CONFIG_TCP_MULTITCP + struct { + unsigned long zerotime; + __u32 ack0; + __u32 snt0; + __u32 snt; + __u32 ack; + __u32 data; +#ifdef CONFIG_TCP_ACHIEVED_RATE + struct { + unsigned long time0; + __u32 ack0; + __u32 delta; + __u32 rate; + __u8 bad_sample; + } achieved; +#endif + } counter; +#endif + +#ifdef 
CONFIG_TCP_PACING + struct { + unsigned long time_snd; + __u32 seq1; + __u32 seq2; + unsigned long time_ack1; + unsigned long time_ack2; + __u32 bdp; + } initial_bw_est; + + struct { + struct timer_list timer; + __u16 count; + __u16 burst; + __u8 lock; + __u16 delta:15, + disabled:1; + __u16 sent; + } pacing; +#endif + u32 tso_deferred; /* Receiver side RTT estimation */ diff -ruN linux-2.6.26/include/net/inet_connection_sock.h linux-2.6.26-MTCP-development/include/net/inet_connection_sock.h --- linux-2.6.26/include/net/inet_connection_sock.h 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/include/net/inet_connection_sock.h 2009-03-03 22:03:56.000000000 +0100 @@ -123,7 +123,13 @@ /* Information on the current probe. */ int probe_size; } icsk_mtup; + +#ifdef CONFIG_TCP_CONG_AS + u32 icsk_ca_priv[19]; +#else u32 icsk_ca_priv[16]; +#endif + #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) }; diff -ruN linux-2.6.26/include/net/tcp.h linux-2.6.26-MTCP-development/include/net/tcp.h --- linux-2.6.26/include/net/tcp.h 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/include/net/tcp.h 2009-03-03 22:03:56.000000000 +0100 @@ -123,7 +123,7 @@ #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif -#define TCP_RTO_MAX ((unsigned)(120*HZ)) +#define TCP_RTO_MAX ((unsigned)(sysctl_tcp_rto_max*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */ @@ -234,6 +234,113 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_max_ssthresh; +extern int sysctl_tcp_rto_max; + +#ifdef CONFIG_TCP_MULTITCP +extern int sysctl_tcp_uob_dbg; +extern int sysctl_tcp_init_ssthresh; +extern void __multitcp_ack_event(struct sock *sk, u32 ack); +extern void __multitcp_cwnd_event(struct sock *sk, const char *caller); +extern void __multitcp_cwnd_log(struct tcp_sock *tp, const char *caller); +extern void __multitcp_snd_event(struct sock 
*sk, u32 seq); +extern void __debug_snd_event(struct tcp_sock *tp, u32 seq); + +static inline void multitcp_ack_event(struct sock *sk, u32 ack){ + __multitcp_ack_event(sk,ack); +} + +static inline void multitcp_cwnd_event(struct sock *sk, const char *caller){ + __multitcp_cwnd_event(sk, caller); +} + +static inline void multitcp_cwnd_log(struct tcp_sock *tp, const char *caller){ + __multitcp_cwnd_log(tp, caller); +} +static inline void multitcp_snd_event(struct sock *sk, u32 seq){ + __multitcp_snd_event(sk, seq); +} +#else +static inline void multitcp_ack_event(struct sock *sk, u32 ack) {}; +static inline void multitcp_cwnd_event(struct sock *sk, const char *caller) {}; +static inline void multitcp_cwnd_log(struct tcp_sock *tp, const char *caller) {}; +static inline void multitcp_snd_event(struct sock *sk, u32 seq){}; +#endif + +extern int tcp_should_est(struct sock *sk); + +#ifdef CONFIG_TCP_PACING +extern int sysctl_tcp_pacing; +extern int sysctl_tcp_initial_bw_est; +extern void __tcp_pacing_recalc_delta(struct sock *sk); +extern void __tcp_pacing_reset_timer(struct sock *sk); +extern void __initial_bw_est(struct sock *sk, u32 ack); +extern void __set_be_endpoint(struct tcp_sock *tp, u32 seq); + +static inline int tcp_pacing_enabled(struct sock *sk) +{ + return (sysctl_tcp_pacing && !tcp_sk(sk)->pacing.disabled && !tcp_should_est(sk)); +} + +static inline void tcp_pacing_recalc_delta(struct sock *sk) +{ + if (tcp_pacing_enabled(sk)) + __tcp_pacing_recalc_delta(sk); +} + +static inline void tcp_pacing_reset_timer(struct sock *sk) +{ + if (tcp_pacing_enabled(sk)) + __tcp_pacing_reset_timer(sk); +} + +static inline void tcp_pacing_lock_tx(struct sock *sk) +{ + if (tcp_pacing_enabled(sk)) + tcp_sk(sk)->pacing.lock=1; +} + +static inline int tcp_pacing_locked(struct sock *sk) +{ + if (tcp_pacing_enabled(sk)) + return tcp_sk(sk)->pacing.lock; + else + return 0; +} + +static inline int tcp_pacing_burst(struct sock *sk) +{ + if (tcp_pacing_enabled(sk)) + return 
(max_t(u32,1,tcp_sk(sk)->pacing.burst)); + else + return 0; +} + +static inline void tcp_initial_bw_est(struct sock *sk, u32 ack){ + if (sysctl_tcp_initial_bw_est && tcp_sk(sk)->initial_bw_est.bdp == 0) + __initial_bw_est(sk,ack); + return; +} + +static inline void tcp_set_be_endpoint(struct tcp_sock *tp, u32 seq){ + if(sysctl_tcp_initial_bw_est && tp->initial_bw_est.bdp == 0) + __set_be_endpoint(tp, seq); +} + +#else +static inline void tcp_pacing_recalc_delta(struct sock *sk) {}; +static inline void tcp_pacing_reset_timer(struct sock *sk) {}; +static inline void tcp_pacing_lock_tx(struct sock *sk) {}; + +static inline void tcp_initial_bw_est(struct sock *sk, u32 ack) {}; +static inline void tcp_set_be_endpoint(struct tcp_sock *tp, u32 seq) {}; +#define tcp_pacing_locked(sk) 0 +#define tcp_pacing_enabled(sk) 0 +#define tcp_pacing_burst(sk) 0 +#endif +#ifdef CONFIG_TCP_ACHIEVED_RATE +extern int sysctl_tcp_achieved_rate_delta; +extern int sysctl_tcp_achieved_rate; +#endif extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; @@ -668,6 +775,9 @@ void (*pkts_acked)(struct sock *sk, u32 num_acked, s32 rtt_us); /* get info for inet_diag (optional) */ void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); + + void (*moderate_cwnd)(struct sock *sk, struct tcp_sock *tp); + char name[TCP_CA_NAME_MAX]; struct module *owner; @@ -1390,5 +1500,27 @@ extern void tcp_v4_init(void); extern void tcp_init(void); +extern void tcp_check_space(struct sock *); + +#ifdef CONFIG_TCP_PACING +extern void tcp_pacing_recalc_delta (struct sock *); +extern void tcp_reset_pacing_timer (struct sock *); +#endif + + +#ifdef CONFIG_TCP_CONG_AS +extern int sysctl_tcp_as_alg; +extern int sysctl_tcp_as_debug; +extern int sysctl_tcp_as_mode; +extern int sysctl_tcp_as_nrtt; + +extern struct tcp_congestion_ops *tcp_tas_select_ca_ops(char*,int*); +struct tcp_tas { + u32 cong_alg_ca_priv[16]; + u32 last_check; + u32 minrtt; + struct tcp_congestion_ops *ca_ops; +}; +#endif 
#endif /* _TCP_H */ diff -ruN linux-2.6.26/kernel/sysctl_check.c linux-2.6.26-MTCP-development/kernel/sysctl_check.c --- linux-2.6.26/kernel/sysctl_check.c 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/kernel/sysctl_check.c 2009-03-03 22:04:42.000000000 +0100 @@ -389,6 +389,25 @@ { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, +#ifdef CONFIG_TCP_MULTITCP + { NET_TCP_UOB_DBG, "tcp_uob_dbg" }, +#ifdef CONFIG_TCP_PACING + { NET_TCP_PACING, "tcp_pacing" }, + { NET_TCP_INITIAL_BW_EST, "tcp_initial_bw_est" }, +#endif + { NET_TCP_INIT_SSTHRESH , "tcp_init_ssthresh" }, + { NET_TCP_RTO_MAX , "tcp_rto_max" }, +#ifdef CONFIG_TCP_CONG_AS + { NET_TCP_AS_ALG , "tcp_as_alg" }, + { NET_TCP_AS_DEBUG , "tcp_as_debug" }, + { NET_TCP_AS_MODE , "tcp_as_mode" }, + { NET_TCP_AS_NRTT , "tcp_as_nrtt" }, +#endif +#ifdef CONFIG_TCP_ACHIEVED_RATE + { NET_TCP_ACHIEVED_RATE_DELTA , "tcp_achieved_rate_delta" }, + { NET_TCP_ACHIEVED_RATE , "tcp_achieved_rate" }, +#endif +#endif { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, {} }; diff -ruN linux-2.6.26/net/ipv4/Kconfig linux-2.6.26-MTCP-development/net/ipv4/Kconfig --- linux-2.6.26/net/ipv4/Kconfig 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/net/ipv4/Kconfig 2009-03-03 22:03:42.000000000 +0100 @@ -576,6 +576,31 @@ For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +menu "TCP Adaptive Selection Env" + +config TCP_CONG_AS + bool "TCP Adaptive Selection Env" + depends on EXPERIMENTAL && TCP_PACING + default n + ---help--- + TCP Adaptive Selection Congestion Control Env. Needs TCP_PACING (and thus TCP_MULTITCP): tcp_as.c uses the pacing, bandwidth-estimation and counter state of struct tcp_sock. 
+ +config TCP_CONG_AS_MOD + tristate "TCP Adaptive Selection Module" + depends on TCP_CONG_AS + select TCP_CONG_HYBLA + select TCP_CONG_HSTCP + select TCP_CONG_WESTWOOD + select TCP_CONG_BIC + select TCP_CONG_HTCP + select TCP_CONG_CUBIC + select TCP_CONG_VEGAS + default n + ---help--- + TCP Adaptive Selection Congestion Control Core. + +endmenu + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -632,5 +657,28 @@ If unsure, say N. +menu "MultiTCP Test Environment" +config TCP_MULTITCP + depends on EXPERIMENTAL + bool "MultiTCP Advanced TCP protocol test environment" + +config TCP_ACHIEVED_RATE + depends on TCP_MULTITCP + default n + bool "TCP 'Achieved Rate' bandwidth estimation" + ---help--- + Calculate available bandwidth for tcp connection for statistical uses. + To enable it use the value "2" in sysctl net.ipv4.tcp_uob_dbg. + +config TCP_PACING + depends on TCP_MULTITCP + default n + bool "TCP Spacing and bandwidth estimation" + ---help--- + TCP spacing and Initial BW estimation: a couple of tiny TCP enhancements that + could improve performance in long-delay channels. + See Documentation/networking/ip-sysctl.txt for how to enable them. 
+endmenu + source "net/ipv4/ipvs/Kconfig" diff -ruN linux-2.6.26/net/ipv4/Makefile linux-2.6.26-MTCP-development/net/ipv4/Makefile --- linux-2.6.26/net/ipv4/Makefile 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/net/ipv4/Makefile 2009-03-03 22:03:41.000000000 +0100 @@ -49,6 +49,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o +obj-$(CONFIG_TCP_CONG_AS_MOD) += tcp_as.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff -ruN linux-2.6.26/net/ipv4/sysctl_net_ipv4.c linux-2.6.26-MTCP-development/net/ipv4/sysctl_net_ipv4.c --- linux-2.6.26/net/ipv4/sysctl_net_ipv4.c 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/net/ipv4/sysctl_net_ipv4.c 2009-03-03 22:03:42.000000000 +0100 @@ -419,7 +419,6 @@ .mode = 0644, .proc_handler = &proc_dointvec }, - #endif { .ctl_name = NET_IPV4_IGMP_MAX_MSF, @@ -753,6 +752,102 @@ .strategy = &sysctl_intvec, .extra1 = &zero }, +#ifdef CONFIG_TCP_MULTITCP + { + .ctl_name = NET_TCP_UOB_DBG, + .procname = "tcp_uob_dbg", + .data = &sysctl_tcp_uob_dbg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_TCP_PACING + { + .ctl_name = NET_TCP_PACING, + .procname = "tcp_pacing", + .data = &sysctl_tcp_pacing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_INITIAL_BW_EST, + .procname = "tcp_initial_bw_est", + .data = &sysctl_tcp_initial_bw_est, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = NET_TCP_INIT_SSTHRESH, + .procname = "tcp_init_ssthresh", + .data = &sysctl_tcp_init_ssthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_RTO_MAX, + .procname = "tcp_rto_max", + .data = &sysctl_tcp_rto_max, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_TCP_CONG_AS + { + .ctl_name = NET_TCP_AS_ALG, + .procname = "tcp_as_alg", + .data = &sysctl_tcp_as_alg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_AS_DEBUG, + .procname = "tcp_as_debug", + .data = &sysctl_tcp_as_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_AS_MODE, + .procname = "tcp_as_mode", + .data = &sysctl_tcp_as_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_AS_NRTT, + .procname = "tcp_as_nrtt", + .data = &sysctl_tcp_as_nrtt, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_TCP_ACHIEVED_RATE + { + .ctl_name = NET_TCP_ACHIEVED_RATE_DELTA, + .procname = "tcp_achieved_rate_delta", + .data = &sysctl_tcp_achieved_rate_delta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_ACHIEVED_RATE, + .procname = "tcp_achieved_rate", + .data = &sysctl_tcp_achieved_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; diff -ruN linux-2.6.26/net/ipv4/tcp_as.c linux-2.6.26-MTCP-development/net/ipv4/tcp_as.c --- linux-2.6.26/net/ipv4/tcp_as.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.26-MTCP-development/net/ipv4/tcp_as.c 2009-03-03 22:03:42.000000000 +0100 @@ -0,0 +1,358 @@ +/* + * TCP Adaptive Selection + * + * TCP-AS Congestion control algorithm + * + */ + +#include +#include +#include +#include + +//extern int sysctl_tcp_as_debug; +//extern int sysctl_tcp_as_mode; +//extern int sysctl_tcp_as_nrtt; +//extern int sysctl_tcp_as_alg; + +void tas_select_module_bdp(struct sock *sk, char *cc_alg) +{ + struct tcp_tas *ca = (void *)inet_csk(sk)->icsk_ca_priv; + struct tcp_sock *tp = tcp_sk(sk); + u64 b=0; + memset(cc_alg,0,TCP_CA_NAME_MAX); + 
strncpy(cc_alg,ca->ca_ops->name, TCP_CA_NAME_MAX); + +#ifdef CONFIG_TCP_PACING + /* b: bandwidth estimate in bit/s; the product is formed in 64 bits so it cannot wrap, and b stays 0 while minrtt is 0 */ + b = ca->minrtt ? div_u64((u64)tp->initial_bw_est.bdp * (tp->mss_cache << 3) * 1000, ca->minrtt) : 0; + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- (Select Module) Bandwidth b = %llu bps, bdp = %u B/ms\n",(unsigned long long)b,tp->initial_bw_est.bdp); +#endif + switch (sysctl_tcp_as_alg){ + case 0: + if (b==0) { + snprintf(cc_alg,TCP_CA_NAME_MAX,"bic"); + break; + } + if (ca->minrtt<=200){ + if (b<=10000000){ + snprintf(cc_alg,TCP_CA_NAME_MAX,"reno"); + }else { + snprintf(cc_alg,TCP_CA_NAME_MAX,"highspeed"); + } + } else { + if (b<=1000000) { + snprintf(cc_alg,TCP_CA_NAME_MAX,"reno"); + } else + snprintf(cc_alg,TCP_CA_NAME_MAX,"hybla"); + } + break; + + case 1: + if (b==0) { + snprintf(cc_alg,TCP_CA_NAME_MAX,"bic"); + break; + } + if (ca->minrtt<=200){ + if (b<=10000000){ + snprintf(cc_alg,TCP_CA_NAME_MAX,"reno"); + }else { + snprintf(cc_alg,TCP_CA_NAME_MAX,"htcp"); + } + } else { + if (b<=1000000) { + snprintf(cc_alg,TCP_CA_NAME_MAX,"reno"); + } else + snprintf(cc_alg,TCP_CA_NAME_MAX,"hybla"); + } + break; + case 2: + if (b==0) { + snprintf(cc_alg,TCP_CA_NAME_MAX,"bic"); + break; + } + if (ca->minrtt<=200){ + if (b<=10000000){ + snprintf(cc_alg,TCP_CA_NAME_MAX,"reno"); + }else { + snprintf(cc_alg,TCP_CA_NAME_MAX,"bic"); + } + } else { + if (b<=1000000) { + snprintf(cc_alg,TCP_CA_NAME_MAX,"reno"); + } else + snprintf(cc_alg,TCP_CA_NAME_MAX,"hybla"); + } + break; + } +} + + +static void tas_change_ca(struct sock *sk) +{ + u32 ssthresh=0; + u32 cwnd=0; + int err=0; + struct tcp_tas *ca = (struct tcp_tas *)(inet_csk(sk)->icsk_ca_priv); + struct tcp_sock *tp = tcp_sk(sk); + char name[TCP_CA_NAME_MAX]; + + if (sysctl_tcp_as_debug){ +#ifdef CONFIG_TCP_PACING + printk("----------TCP_AS LOG ----------- (Change CA) BDP: %u RTT min: %u MODE %d\n",tp->initial_bw_est.bdp,ca->minrtt,sysctl_tcp_as_mode); +#endif + } + + tas_select_module_bdp(sk, name); + + if (strcmp(name,ca->ca_ops->name)!= 0 ) { + 
struct tcp_congestion_ops *cops; + ssthresh = tp->snd_ssthresh; + cwnd = tp->snd_cwnd; + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- (Change CA) Changing from %s to %s\n",ca->ca_ops->name,name); + cops = tcp_tas_select_ca_ops(name,&err); + if (cops && !err) { + /* Switch to Hybla...enable pacing! */ + if (strcmp(name,"hybla")==0) + tcp_sk(sk)->pacing.disabled=0; + /* Switch from Hybla...disable pacing! */ + if (strcmp(ca->ca_ops->name,"hybla")==0) + tcp_sk(sk)->pacing.disabled=1; + if (ca->ca_ops->release) + ca->ca_ops->release(sk); + module_put(ca->ca_ops->owner); + ca->ca_ops=cops; + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- (Change CA) Changed to %s\n",ca->ca_ops->name); + printk(KERN_DEBUG "tcp_ca: %u, %lu, %u, %u, %u, %u, %s, as selected %s\n",tp->counter.snt0,jiffies-tp->counter.zerotime, tp->snd_cwnd, tp->snd_ssthresh, tp->snd_cwnd_clamp, inet_csk(sk)->icsk_ca_state, __FUNCTION__, ca->ca_ops->name); + + + if (ca->ca_ops->init) + ca->ca_ops->init(sk); + tp->snd_ssthresh=ssthresh; + tp->snd_cwnd=cwnd; + } + } +} + + +static void tas_init(struct sock *sk) +{ + + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *isk = inet_sk(sk); + char name[TCP_CA_NAME_MAX]; + int err=0; + + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- Connection from ip %d.%d.%d.%d on port %d\n",NIPQUAD(isk->daddr),be16_to_cpu(isk->sport)); + ca->minrtt=tp->srtt>>3; + ca->last_check=jiffies; + tcp_sk(sk)->pacing.disabled=1; + snprintf(name,TCP_CA_NAME_MAX,"reno"); + ca->ca_ops=tcp_tas_select_ca_ops(name,&err); + if (!ca->ca_ops) + ca->ca_ops=&tcp_init_congestion_ops; + if (ca->ca_ops->init && !err) + ca->ca_ops->init(sk); + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- ** Init new connection ** \n----------TCP_AS LOG ----------- inital congestion control tcp_%s, MODE %d\n",ca->ca_ops->name,sysctl_tcp_as_mode); + 
printk(KERN_DEBUG "tcp_ca: %u, %lu, %u, %u, %u, %u, %s, as selected %s\n",tp->counter.snt0,jiffies-tp->counter.zerotime, tp->snd_cwnd, tp->snd_ssthresh, tp->snd_cwnd_clamp, inet_csk(sk)->icsk_ca_state, __FUNCTION__, ca->ca_ops->name); +} + +static void tas_release(struct sock *sk) { + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->release) + ca->ca_ops->release(sk); + tcp_sk(sk)->pacing.disabled=0; + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- ** Release connection **\n----------TCP_AS LOG ----------- final congestion control tcp_%s\n",ca->ca_ops->name); + module_put(ca->ca_ops->owner); +} + + +static u32 tas_ssthresh(struct sock *sk) { + struct tcp_tas *ca =(struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + return ca->ca_ops->ssthresh(sk); +} + +static u32 tas_min_cwnd(const struct sock *sk) +{ + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + return ca->ca_ops->min_cwnd ? ca->ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; +} + +static void tas_state(struct sock *sk, u8 ca_state) +{ + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->set_state) + ca->ca_ops->set_state(sk,ca_state); +} + +static void tas_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt) +{ + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->pkts_acked) + ca->ca_ops->pkts_acked(sk,num_acked, rtt); +} + +static u32 tas_undo_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->undo_cwnd) + return ca->ca_ops->undo_cwnd(sk); + else + return max(tp->snd_cwnd, tp->snd_ssthresh<<1); +} + +static void tas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->get_info) + ca->ca_ops->get_info(sk,ext,skb); +} + +static void tas_cwnd_event(struct sock *sk, enum tcp_ca_event 
event) +{ + struct tcp_tas *ca = (struct tcp_tas *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->cwnd_event) + ca->ca_ops->cwnd_event(sk,event); +} + + + +static void tas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) +{ + struct tcp_tas *ca = (void *)inet_csk(sk)->icsk_ca_priv; + struct tcp_sock *tp = tcp_sk(sk); + + if ((tp->srtt>>3) < ca->minrtt){ + ca->minrtt = (tp->srtt>>3); + } + if (!sysctl_tcp_as_mode && ca->last_check && tp->initial_bw_est.bdp) { + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- Cong Avoid: call to change_ca, now %d, last_check %u, rtt %u ,minrtt %u, nrtt %u, MODE %d \n",(int)jiffies,ca->last_check,tp->srtt>>3,ca->minrtt,sysctl_tcp_as_nrtt,sysctl_tcp_as_mode); + tas_change_ca(sk); + ca->last_check=0; + } + else if(sysctl_tcp_as_mode && (jiffies-ca->last_check >= ca->minrtt*sysctl_tcp_as_nrtt)) { + if (sysctl_tcp_as_debug) + printk("----------TCP_AS LOG ----------- Cong Avoid: call to change_ca, now %d, last_check %u, rtt %u ,minrtt %u, nrtt %u, MODE %d \n",(int)jiffies,ca->last_check,tp->srtt>>3,ca->minrtt,sysctl_tcp_as_nrtt,sysctl_tcp_as_mode); + ca->last_check=jiffies; + tas_change_ca(sk); + } + ca->ca_ops->cong_avoid(sk,ack,in_flight); + +} + +void tas_moderate_cwnd(struct sock *sk, struct tcp_sock *tp) { + struct tcp_tas *ca = (void *)inet_csk(sk)->icsk_ca_priv; + if (ca->ca_ops->moderate_cwnd) + ca->ca_ops->moderate_cwnd(sk,tp); + else { + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; + } +} + +/* +enum { + NET_TCP_AS_DEBUG=140, + NET_TCP_AS_MODE=141, + NET_TCP_AS_NRTT=142, + NET_TCP_AS_ALG=143, +}; + +static ctl_table tcp_as_ctl_table[] = { + { + .ctl_name = NET_TCP_AS_DEBUG, + .procname ="tcp_as_debug", + .data =&sysctl_tcp_as_debug, + .maxlen =sizeof(int), + .mode = 0644, + .proc_handler =&proc_dointvec + }, + { + .ctl_name = NET_TCP_AS_MODE, + .procname ="tcp_as_mode", + .data =&sysctl_tcp_as_mode, + .maxlen =sizeof(int), + .mode = 
0644, + .proc_handler =&proc_dointvec + }, + { + .ctl_name = NET_TCP_AS_NRTT, + .procname ="tcp_as_nrtt", + .data =&sysctl_tcp_as_nrtt, + .maxlen =sizeof(int), + .mode = 0644, + .proc_handler =&proc_dointvec + }, + { + .ctl_name = NET_TCP_AS_ALG, + .procname ="tcp_as_alg", + .data =&sysctl_tcp_as_alg, + .maxlen =sizeof(int), + .mode = 0644, + .proc_handler =&proc_dointvec + }, + {.ctl_name=0} +}; +*/ +struct tcp_congestion_ops tcp_as = { + .init = tas_init, + .release = tas_release, + .ssthresh = tas_ssthresh, + .min_cwnd = tas_min_cwnd, + .cong_avoid = tas_cong_avoid, + .set_state = tas_state, + .undo_cwnd = tas_undo_cwnd, + .pkts_acked = tas_pkts_acked, + .cwnd_event = tas_cwnd_event, + .get_info = tas_get_info, + .moderate_cwnd = tas_moderate_cwnd, + + .owner = THIS_MODULE, + .name = "as" +}; + + /* static struct ctl_table_header *tcp_as_table_header; */ + +static int __init tas_register(void) +{ + BUILD_BUG_ON(sizeof(struct tcp_tas) > (19 * sizeof(u32))); +/* + if (!(tcp_as_table_header = register_sysctl_table(tcp_as_ctl_table))) + return EPERM; +*/ + request_module("tcp_highspeed"); + request_module("tcp_htcp"); + request_module("tcp_hybla"); + request_module("tcp_vegas"); + request_module("tcp_westwood"); + request_module("tcp_bic"); + request_module("tcp_cubic"); + return tcp_register_congestion_control(&tcp_as); + +} + +static void __exit tas_unregister(void) +{ +/* + if (tcp_as_table_header) + unregister_sysctl_table(tcp_as_table_header); +*/ + tcp_unregister_congestion_control(&tcp_as); +} + +module_init(tas_register); +module_exit(tas_unregister); +MODULE_AUTHOR("Angelo Tornese"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Adaptive Selection"); + diff -ruN linux-2.6.26/net/ipv4/tcp_cong.c linux-2.6.26-MTCP-development/net/ipv4/tcp_cong.c --- linux-2.6.26/net/ipv4/tcp_cong.c 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/net/ipv4/tcp_cong.c 2009-03-03 22:03:42.000000000 +0100 @@ -30,6 +30,48 @@ return NULL; } +#ifdef 
CONFIG_TCP_CONG_AS +int sysctl_tcp_as_alg=0; +int sysctl_tcp_as_debug=0; +int sysctl_tcp_as_mode=0; +int sysctl_tcp_as_nrtt=10; + +EXPORT_SYMBOL_GPL(sysctl_tcp_as_alg); +EXPORT_SYMBOL_GPL(sysctl_tcp_as_debug); +EXPORT_SYMBOL_GPL(sysctl_tcp_as_mode); +EXPORT_SYMBOL_GPL(sysctl_tcp_as_nrtt); + +struct tcp_congestion_ops *tcp_tas_select_ca_ops(char *name,int *err) +{ + struct tcp_congestion_ops *cops; + rcu_read_lock(); + cops = tcp_ca_find(name); + *err = cops ? 0 : -ENOENT; + /* Look up and pin the owner inside one RCU read section; an algorithm whose module + * cannot be pinned is reported as NULL so callers never use or module_put() it. */ + if (cops && !try_module_get(cops->owner)) { + *err = -EBUSY; + cops = NULL; + } + rcu_read_unlock(); + return cops; +} +EXPORT_SYMBOL_GPL(tcp_tas_select_ca_ops); +#else +int sysctl_tcp_as_alg=0; +int sysctl_tcp_as_debug=0; +int sysctl_tcp_as_mode=0; +int sysctl_tcp_as_nrtt=10; +EXPORT_SYMBOL_GPL(sysctl_tcp_as_alg); +EXPORT_SYMBOL_GPL(sysctl_tcp_as_debug); +EXPORT_SYMBOL_GPL(sysctl_tcp_as_mode); +EXPORT_SYMBOL_GPL(sysctl_tcp_as_nrtt); + +struct tcp_congestion_ops *tcp_tas_select_ca_ops(char *name,int *err) {return NULL;} +EXPORT_SYMBOL_GPL(tcp_tas_select_ca_ops); +#endif + + /* * Attach new congestion control algorithm to the list * of available options. 
@@ -282,6 +324,10 @@
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u32 left;
+	/* Pacing on: report 1 before any of the cwnd checks below. */
+#ifdef CONFIG_TCP_PACING
+	if(sysctl_tcp_pacing)
+		return 1;
+#endif
 	if (in_flight >= tp->snd_cwnd)
 		return 1;
diff -ruN linux-2.6.26/net/ipv4/tcp_hybla.c linux-2.6.26-MTCP-development/net/ipv4/tcp_hybla.c
--- linux-2.6.26/net/ipv4/tcp_hybla.c 2008-07-13 23:51:29.000000000 +0200
+++ linux-2.6.26-MTCP-development/net/ipv4/tcp_hybla.c 2009-03-03 22:03:42.000000000 +0100
@@ -22,6 +22,7 @@
 	u32 rho_3ls; /* Rho parameter, <<3 */
 	u32 rho2_7ls; /* Rho^2, <<7 */
 	u32 minrtt; /* Minimum smoothed round trip time value seen */
+	u32 mincwnd; /* Minimum for cwnd in Recovery */
 };
 /* Hybla reference round trip time (default= 1/40 sec = 25 ms),
@@ -66,8 +67,19 @@
+/*
+ * set_state hook. On entry to TCP_CA_Recovery it sets snd_ssthresh to
+ * half the packets in flight (at least 2) and records a quarter of the
+ * packets in flight as mincwnd. Other states are ignored; hybla_en is
+ * no longer updated here (the old assignment is commented out).
+ */
 static void hybla_state(struct sock *sk, u8 ca_state)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct hybla *ca = inet_csk_ca(sk);
-	ca->hybla_en = (ca_state == TCP_CA_Open);
+//	ca->hybla_en = (ca_state == TCP_CA_Open);
+	if(ca_state == TCP_CA_Recovery){
+		tp->snd_ssthresh = max_t(u32, 2U, tcp_packets_in_flight(tp) >> 1);
+		ca->mincwnd=tcp_packets_in_flight(tp)/4;
+#ifdef CONFIG_TCP_PACING
+		/* Seed the bandwidth-estimation bdp with twice the new ssthresh. */
+		if(tcp_should_est(sk)){
+			tp->initial_bw_est.bdp = tp->snd_ssthresh << 1;
+		}
+#endif
+	}
+
 }
 static inline u32 hybla_fraction(u32 odds)
@@ -97,9 +109,10 @@
 		hybla_recalc_param(sk);
 		ca->minrtt = tp->srtt;
 	}
-
+	/* This early cwnd-limited check is compiled only without pacing. */
+#ifndef CONFIG_TCP_PACING
 	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
+#endif
 	if (!ca->hybla_en) {
 		tcp_reno_cong_avoid(sk, ack, in_flight);
@@ -141,9 +154,22 @@
 	}
 	odd = increment % 128;
+	/* With pacing, the check adds rho2_7ls >> 7 to in_flight first. */
+#ifdef CONFIG_TCP_PACING
+	if (!tcp_is_cwnd_limited(sk, in_flight + (ca->rho2_7ls >> 7)))
+		return;
+#endif
+
 	tp->snd_cwnd += increment >> 7;
 	ca->snd_cwnd_cents += odd;
+	/* check when cwnd has not been incremented for a while*/
+	if (increment==0 && odd==0 && tp->snd_cwnd_cnt>=tp->snd_cwnd)
+	{
+		tp->snd_cwnd++;
+		tp->snd_cwnd_cnt=0;
+	}
+
+
 	/* check when fractions goes >=128 and increase cwnd by 1. */
 	while (ca->snd_cwnd_cents >= 128) {
 		tp->snd_cwnd++;
@@ -158,12 +184,26 @@
 	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
 }
+/*
+ * min_cwnd hook: the larger of a quarter of the packets currently in
+ * flight and the mincwnd value stored by hybla_state().
+ */
+static u32 hybla_min_cwnd (const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
+	return max_t(u32, tcp_packets_in_flight(tp) / 4, ca->mincwnd);
+}
+
+/* moderate_cwnd hook: deliberately empty, the window is left untouched. */
+static void hybla_moderate_cwnd(struct sock *sk, struct tcp_sock *tp)
+{
+	return;
+}
+
+
 static struct tcp_congestion_ops tcp_hybla = {
 	.init = hybla_init,
 	.ssthresh = tcp_reno_ssthresh,
-	.min_cwnd = tcp_reno_min_cwnd,
+	.min_cwnd = hybla_min_cwnd,
 	.cong_avoid = hybla_cong_avoid,
 	.set_state = hybla_state,
+	.moderate_cwnd = hybla_moderate_cwnd,
 	.owner = THIS_MODULE,
 	.name = "hybla"
diff -ruN linux-2.6.26/net/ipv4/tcp_input.c linux-2.6.26-MTCP-development/net/ipv4/tcp_input.c
--- linux-2.6.26/net/ipv4/tcp_input.c 2008-07-13 23:51:29.000000000 +0200
+++ linux-2.6.26-MTCP-development/net/ipv4/tcp_input.c 2009-03-03 22:03:42.000000000 +0100
@@ -113,6 +113,280 @@
 #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
 #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
 #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
+#ifdef CONFIG_TCP_PACING
+
+/* Initial ssthresh estimation, main engine.
+ * We try to get a better ssthresh value, based
+ * upon an estimation on bdp, to avoid overshoot,
+ * as suggested in:
+ * J.Hoe, "Improving the start-up behavior of a
+ * Congestion Control scheme for TCP"
+ *
+ *
+ */
+
+int sysctl_tcp_initial_bw_est=0;
+
+/*
+ * Tell whether the initial bandwidth estimation should (still) run.
+ *
+ * Outside TCP_CA_Open this has a side effect: initial_bw_est.bdp is
+ * overwritten with twice snd_ssthresh and 0 is returned. In TCP_CA_Open
+ * the result is non-zero only while the sysctl is on, no bdp has been
+ * stored yet and counter.snt0 has been recorded.
+ */
+int tcp_should_est(struct sock *sk){
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	if (icsk->icsk_ca_state != TCP_CA_Open) {
+		tp->initial_bw_est.bdp = tp->snd_ssthresh << 1;
+		return 0;
+	}
+	return (sysctl_tcp_initial_bw_est && tp->initial_bw_est.bdp == 0 && tp->counter.snt0);
+}
+
+EXPORT_SYMBOL(tcp_should_est);
+
+/*
+ * Turn the two recorded ACK timestamps into a bdp estimate (in segments)
+ * and derive snd_ssthresh from it (half the bdp, at least 2).
+ * All timestamps are jiffies values, so "bpms" is bytes per jiffy; it is
+ * bytes per millisecond only when HZ is 1000.
+ * @ack is not used.
+ */
+static void __do_initial_bw_est(struct sock *sk, u32 ack)
+{
+	struct tcp_sock *tp=tcp_sk(sk);
+	/* ok, we have the second ack. Let's compute a value for bdp */
+	unsigned long delay;
+	unsigned long trip;
+	__u32 datasize, bpms=0;
+
+	datasize=(tp->initial_bw_est.seq2 - tp->initial_bw_est.seq1);
+	trip=(tp->initial_bw_est.time_ack1 - tp->initial_bw_est.time_snd);
+	delay=(tp->initial_bw_est.time_ack2 - tp->initial_bw_est.time_ack1);
+
+	if (delay < 2) {
+		/* Repeat estimation: the two ACKs are less than two jiffies apart */
+		printk (KERN_DEBUG "Bandwidth estimator: repeat estimation\n");
+		tp->initial_bw_est.seq1 = 0;
+		tp->initial_bw_est.seq2 = 0;
+		tp->initial_bw_est.time_snd = 0;
+		tp->initial_bw_est.time_ack1 = 0;
+		tp->initial_bw_est.time_ack2 = 0;
+		return;
+	}
+	bpms = ( datasize / delay ) ;
+	if(sysctl_tcp_uob_dbg)
+		printk (KERN_DEBUG "Bandwidth estimator: \n\t ack1@ %lu, \n\t ack2@ %lu, \n\t bandwidth:%u B/ms \n\n", tp->initial_bw_est.time_ack1, tp->initial_bw_est.time_ack2, bpms);
+	tp->initial_bw_est.bdp = max_t(u32, 1U, (bpms * trip) / tp->mss_cache);
+	/* Set the new threshold */
+	tp->snd_ssthresh = max_t(u32, 2U, (tp->initial_bw_est.bdp)>>1);
+
+	tcp_pacing_recalc_delta(sk);
+}
+
+/*
+ * ACK-side half of the estimator. An ACK equal to seq1 + one MSS stamps
+ * time_ack1; an ACK equal to seq2 + one MSS stamps time_ack2 and runs the
+ * computation, unless the first ACK was never seen, in which case the
+ * send-side state is cleared so that the endpoints get chosen again.
+ */
+void __initial_bw_est(struct sock *sk, u32 ack)
+{
+	struct tcp_sock *tp=tcp_sk(sk);
+	if(ack == tp->initial_bw_est.seq1 + tp->mss_cache){
+		tp->initial_bw_est.time_ack1=jiffies;
+		if(sysctl_tcp_uob_dbg)
+			printk(KERN_DEBUG "ack1 @ %lu\n",jiffies-tp->counter.zerotime);
+		return;
+	}
+	if (ack == tp->initial_bw_est.seq2 + tp->mss_cache){
+
+		/* first ack skipped, try again */
+		if (tp->initial_bw_est.time_ack1==0)
+			goto reset;
+
+		tp->initial_bw_est.time_ack2=jiffies;
+		if(sysctl_tcp_uob_dbg)
+			printk(KERN_DEBUG "ack2 @ %lu\n",jiffies-tp->counter.zerotime);
+
+		__do_initial_bw_est(sk,ack);
+	}
+	return;
+
+reset:
+	tp->initial_bw_est.seq1=0;
+	tp->initial_bw_est.seq2=0;
+	tp->initial_bw_est.time_snd=0;
+	return;
+}
+EXPORT_SYMBOL(__initial_bw_est);
+
+/*
+ * Send-side half of the estimator: pick the two sequence numbers whose
+ * ACKs will be timed. seq1 (and time_snd) is taken from a segment lying
+ * more than two MSS past counter.snt0; seq2 is a segment sent in the same
+ * jiffy as seq1. Once seq2 is set nothing is changed any more.
+ */
+void __set_be_endpoint(struct tcp_sock *tp, u32 seq)
+{
+	if (jiffies == tp->initial_bw_est.time_snd)
+	{
+		if(sysctl_tcp_uob_dbg)
+			printk(KERN_DEBUG "set2 @ segment %u\n", seq-tp->counter.snt0);
+		tp->initial_bw_est.seq2=seq;
+	}
+
+	if (tp->initial_bw_est.seq2)
+		return;
+
+	// sets 1st seqn
+	if ((seq-tp->counter.snt0)>(tp->mss_cache<<1)){
+		if(sysctl_tcp_uob_dbg)
+			printk(KERN_DEBUG "set1 @ segment %u\n", seq-tp->counter.snt0);
+		tp->initial_bw_est.seq1=seq;
+		tp->initial_bw_est.time_snd=jiffies;
+	}
+}
+EXPORT_SYMBOL(__set_be_endpoint);
+
+
+
+#endif
+
+#ifdef CONFIG_TCP_MULTITCP
+int sysctl_tcp_init_ssthresh=0;
+int sysctl_tcp_uob_dbg=0;
+
+
+#ifdef CONFIG_TCP_ACHIEVED_RATE
+int sysctl_tcp_achieved_rate = 1;
+int sysctl_tcp_achieved_rate_delta = 0;
+/*
+ * Update the "achieved rate" estimate from the current ACK.
+ *
+ * Progress is counted as bytes acked since counter.ack0 plus
+ * sacked_out * mss_cache. When more than achieved.delta jiffies have
+ * passed since the previous sample point and progress has grown, a new
+ * sample (progress difference << 3, divided by the elapsed jiffies) is
+ * stored in achieved.rate and 1 is returned. If progress did not grow
+ * the sample is flagged bad and the window is doubled. Returns 0 when no
+ * sample was taken.
+ *
+ * The comparison is made between the two relative counters
+ * (achieved.ack0 and new_ack); comparing against the absolute @ack let
+ * the subtraction below underflow.
+ */
+static u8 achieved_rate_recalc(struct sock *sk, u32 ack)
+{
+	struct tcp_sock *tp=tcp_sk(sk);
+	unsigned long now = jiffies;
+	unsigned long elapsed;
+	u32 sample;
+	u32 _ack = ack-tp->counter.ack0;
+	u32 new_ack = _ack + tp->sacked_out * tp->mss_cache;
+
+	if (tp->counter.achieved.bad_sample) {
+		/* Undo the doubling applied when the sample was flagged bad. */
+		tp->counter.achieved.bad_sample = 0;
+		tp->counter.achieved.delta >>= 1;
+	} else {
+		/* Sysctl interval, or one smoothed RTT when the sysctl is 0. */
+		tp->counter.achieved.delta = sysctl_tcp_achieved_rate_delta;
+		if (tp->counter.achieved.delta == 0)
+			tp->counter.achieved.delta = tp->srtt >> 3;
+	}
+
+
+	/* time0 == 0 means "no sample point yet". */
+	if (tp->counter.achieved.time0 == 0) {
+		tp->counter.achieved.time0 = now;
+		tp->counter.achieved.ack0 = new_ack;
+	}
+
+	elapsed = now - tp->counter.achieved.time0;
+	if (elapsed <= tp->counter.achieved.delta)
+		return 0;
+
+	if (new_ack <= tp->counter.achieved.ack0) {
+		tp->counter.achieved.bad_sample = 1;
+		tp->counter.achieved.delta <<= 1;
+		return 0;
+	}
+
+	/* elapsed > delta >= 0 here, so the divisor is never zero. */
+	sample = (( new_ack - tp->counter.achieved.ack0)<<3) / elapsed;
+	/* Smoothing weights are currently 0 (old rate) and 1 (new sample). */
+	tp->counter.achieved.rate = ( (tp->counter.achieved.rate * 0) + sample * 4) >> 2;
+	if (sysctl_tcp_uob_dbg == 2)
+		printk(KERN_DEBUG "tcp_ack: %u, %lu, %u, %u, %u, %lu, %u, %lu, %u, %u, %u\n",tp->counter.snt0,now-tp->counter.zerotime,_ack,(tp->sacked_out*tp->mss_cache), tp->counter.achieved.ack0, tp->counter.achieved.time0 - tp->counter.zerotime, new_ack, now - tp->counter.zerotime, tp->counter.achieved.delta, sample, tp->counter.achieved.rate);
+	tp->counter.achieved.time0 = now;
+	tp->counter.achieved.ack0 = new_ack;
+	return 1U;
+}
+#else
+#define achieved_rate_recalc(sk,ack) 0
+/* The code below tests sysctl_tcp_achieved_rate, so stub that name. */
+#define sysctl_tcp_achieved_rate 0
+/* Old spelling kept in case something else still refers to it. */
+#define sysctl_achieved_rate 0
+#endif
+
+/*
+ * Per-ACK hook: once counter.zerotime is set, record counter.ack0 on the
+ * first ACK, refresh the achieved-rate estimate when enabled, and with
+ * tcp_uob_dbg == 1 print one "tcp_ack:" record. tcp_initial_bw_est() is
+ * called for every ACK (it is defined outside this part of the patch).
+ */
+void __multitcp_ack_event(struct sock *sk, u32 ack)
+{
+	struct tcp_sock *tp=tcp_sk(sk);
+	__u32 _una,_ack,_ack0,_snt0;
+	int new_rate = 0;
+
+	if (tp->counter.zerotime){
+		/* Set counter.ack0 field */
+		if(tp->counter.ack0 == 0){
+			tp->counter.ack0=ack;
+		}
+
+		_ack0=tp->counter.ack0;
+		_snt0=tp->counter.snt0;
+		_una=tp->snd_una-_snt0+tp->mss_cache;
+		_ack=ack-_ack0;
+		if (_ack==0) _una=0;
+
+		if (sysctl_tcp_achieved_rate == 1) {
+			new_rate = achieved_rate_recalc(sk, ack);
+		}
+
+		if(sysctl_tcp_uob_dbg == 1){
+#ifdef CONFIG_TCP_ACHIEVED_RATE
+			/* counter.achieved exists only with CONFIG_TCP_ACHIEVED_RATE. */
+			if (new_rate)
+				printk(KERN_DEBUG "tcp_ack: %u, %lu, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",tp->counter.snt0,jiffies-tp->counter.zerotime,_ack,ack,_una,tcp_packets_in_flight(tp),0,tp->srtt>>3,tp->rttvar, inet_csk(sk)->icsk_rto, tp->snd_wnd, tp->packets_out, tp->sacked_out, tp->fackets_out, tp->lost_out, tp->retrans_out, tp->counter.achieved.rate);
+			else
+#endif
+				printk(KERN_DEBUG "tcp_ack: %u, %lu, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",tp->counter.snt0,jiffies-tp->counter.zerotime,_ack,ack,_una,tcp_packets_in_flight(tp),0,tp->srtt>>3,tp->rttvar, inet_csk(sk)->icsk_rto, tp->snd_wnd, tp->packets_out, tp->sacked_out, tp->fackets_out, tp->lost_out, tp->retrans_out);
+		}
+	}
+	tcp_initial_bw_est(sk,ack);
+}
+
+EXPORT_SYMBOL(__multitcp_ack_event);
+
+/*
+ * With tcp_uob_dbg == 1, print a "tcp_ca:" record from a tcp_sock only.
+ * The empty field before the caller name stands in for the CA_STATE
+ * column printed by __multitcp_cwnd_event().
+ */
+void __multitcp_cwnd_log(struct tcp_sock *tp, const char *caller){
+	if(sysctl_tcp_uob_dbg == 1){
+		printk(KERN_DEBUG "tcp_ca: %u, %lu, %u, %u, %u, ,%s\n",tp->counter.snt0,jiffies-tp->counter.zerotime, tp->snd_cwnd, tp->snd_ssthresh, tp->snd_cwnd_clamp, caller);
+	}
+}
+EXPORT_SYMBOL(__multitcp_cwnd_log);
+
+/*
+ * With tcp_uob_dbg == 1, print a full "tcp_ca:" record: window values,
+ * congestion state, caller name and congestion-control name.
+ */
+void __multitcp_cwnd_event(struct sock *sk, const char *caller){
+	struct tcp_sock *tp=tcp_sk(sk);
+	if(sysctl_tcp_uob_dbg == 1){
+		printk(KERN_DEBUG "tcp_ca: %u, %lu, %u, %u, %u, %u, %s, %s\n",tp->counter.snt0,jiffies-tp->counter.zerotime, tp->snd_cwnd, tp->snd_ssthresh, tp->snd_cwnd_clamp, inet_csk(sk)->icsk_ca_state, caller, inet_csk(sk)->icsk_ca_ops->name);
+	}
+}
+EXPORT_SYMBOL(__multitcp_cwnd_event);
+
+/*
+ * TCP DEBUG: Log a Send event.
+ * The first call after zerotime is set records seq as both counter.snt0
+ * and counter.ack0. A segment whose relative sequence number exceeds
+ * counter.data counts as new data; anything else is logged as a
+ * retransmission with size 0.
+ */
+static inline void debug_snd_event(struct tcp_sock *tp, u32 seq)
+{
+	__u32 pktsize,_rseq;
+	if (tp->counter.zerotime >0){
+
+		if (tp->counter.snt0==0){
+			tp->counter.snt0=seq;
+			tp->counter.ack0=seq;
+		}
+		_rseq=seq-tp->counter.snt0;
+		if(_rseq > tp->counter.data){
+			// New packet
+			tp->counter.snt++;
+			pktsize=_rseq-tp->counter.data;
+			tp->counter.data=_rseq;
+
+		}else{
+			// Retransmission
+			pktsize=0;
+		}
+		if(sysctl_tcp_uob_dbg == 1){
+#ifdef CONFIG_TCP_PACING
+			printk(KERN_DEBUG "tcp_snd: %u, %lu, %u, %u, %u, %u, %u, %u\n",tp->counter.snt0,jiffies-tp->counter.zerotime,_rseq,seq,pktsize,tp->snd_una,tp->pacing.delta,tp->pacing.burst);
+#else
+			printk(KERN_DEBUG "tcp_snd: %u, %lu, %u, %u, %u, %u, off, off\n",tp->counter.snt0,jiffies-tp->counter.zerotime,_rseq,seq,pktsize,tp->snd_una);
+#endif
+		}
+	}
+}
+
+/*
+ * Per-segment send hook. The first call on a socket starts the counters
+ * (zerotime = jiffies) and prints the column headers of the debug
+ * records. Every call feeds the bandwidth estimator while
+ * tcp_should_est() allows it, then logs the segment.
+ * NOTE(review): the headers label the time columns "ms", but the values
+ * are jiffies differences.
+ */
+void __multitcp_snd_event(struct sock *sk, u32 seq){
+	struct tcp_sock *tp = tcp_sk(sk);
+	if (!tp->counter.zerotime){
+		tp->counter.zerotime=jiffies;
+		tp->counter.snt=0;
+		tp->counter.data=0;
+		if(sysctl_tcp_uob_dbg == 1 ){
+			printk(KERN_DEBUG "tcp_snd: Conn_ID, TIME(ms), RELATIVE SEQ_N, ABSOLUTE SEQ_N, SEG_SIZE (=0 IF RETR), ACK_EXPECTED, SPAC_D, SPAC_BURST\n");
+			printk(KERN_DEBUG "tcp_ack: Conn_ID, TIME(ms), RELATIVE ACK, ABSOLUTE ACK, ACK_EXPECTED, PRIOR_IN_FLIGHT, RTTI, RTTS, RTTVAR, RTO, SND_WND, PACKETS_OUT, SACKED_OUT, FACKETS_OUT, LOST_OUT, RETRANS_OUT, ACHIEVED_RATE\n");
+
+			printk(KERN_DEBUG "tcp_ca: Conn_ID, TIME(ms), CWND, SSTHRESH, CWND_CLAMP, CA_STATE, CALLER, CONGESTION_CONTROL\n");
+		}
+		if (sysctl_tcp_uob_dbg == 2) {
+			printk(KERN_DEBUG "tcp_ack: CONN_ID, TIME (ms), ACKED (B), SACKED (B), ACK0_VALUE (B), ACK0_TIME (ms), ACK1_VALUE (B), ACK1_TIME (ms), DELTA (ms), ACHIEVED_SAMPLE (Kbit/s), ACHIEVED_RATE (Kbit/s)\n");
+		}
+	}
+
+	if(tcp_should_est(sk))
+		tcp_set_be_endpoint(tp, seq);
+
+	debug_snd_event(tp, seq);
+}
+EXPORT_SYMBOL(__multitcp_snd_event);
+
+#endif
+/*** END MultiTCP Code ***/
+
+
+
 #define TCP_REMNANT
(TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -850,6 +1124,7 @@ TCP_ECN_queue_cwr(tp); tcp_set_ca_state(sk, TCP_CA_CWR); + multitcp_cwnd_event(sk,__FUNCTION__); } } @@ -1330,6 +1605,7 @@ tcp_for_write_queue_from(skb, sk) { int in_sack = 0; int dup_sack = dup_sack_in; + multitcp_cwnd_event(sk,__FUNCTION__); if (skb == tcp_send_head(sk)) break; @@ -1648,8 +1924,10 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_limit_reno_sacked(tp)) + if (tcp_limit_reno_sacked(tp)) { tcp_update_reordering(sk, tp->packets_out + addend, 0); + multitcp_cwnd_event(sk,__FUNCTION__); + } } /* Emulate SACKs for SACKless connection: account for a new dupack. */ @@ -1911,6 +2189,8 @@ tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + multitcp_cwnd_event(sk,__FUNCTION__); + tp->bytes_acked = 0; tcp_clear_retrans_partial(tp); @@ -1946,6 +2226,7 @@ tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); + multitcp_cwnd_event(sk,__FUNCTION__); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); /* Abort F-RTO algorithm if one is in progress */ @@ -2266,13 +2547,20 @@ /* CWND moderation, preventing bursts due to too big ACKs * in dubious situations. 
*/ -static inline void tcp_moderate_cwnd(struct tcp_sock *tp) +static inline void tcp_moderate_cwnd(struct sock *sk, struct tcp_sock *tp) { - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + tcp_max_burst(tp)); - tp->snd_cwnd_stamp = tcp_time_stamp; + struct inet_connection_sock *icsk = inet_csk(sk); + if (icsk->icsk_ca_ops->moderate_cwnd) { + icsk->icsk_ca_ops->moderate_cwnd(sk, tp); + return; + } + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; + multitcp_cwnd_log(tp, __FUNCTION__); } + /* Lower bound on congestion window is slow start threshold * unless congestion avoidance choice decides to overide it. */ @@ -2288,6 +2576,8 @@ { struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; + if (tp->may_keep_cwnd) + goto out; if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { @@ -2300,6 +2590,8 @@ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_stamp = tcp_time_stamp; } +out: + multitcp_cwnd_event(sk, __FUNCTION__); } /* Nothing was retransmitted or returned timestamp is less @@ -2363,7 +2655,7 @@ } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } - tcp_moderate_cwnd(tp); + tcp_moderate_cwnd(sk, tp); tp->snd_cwnd_stamp = tcp_time_stamp; /* There is something screwy going on with the retrans hints after @@ -2397,10 +2689,11 @@ /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. 
*/ - tcp_moderate_cwnd(tp); + tcp_moderate_cwnd(sk, tp); return 1; } tcp_set_ca_state(sk, TCP_CA_Open); + multitcp_cwnd_event(sk, __FUNCTION__); return 0; } @@ -2494,6 +2787,7 @@ if (inet_csk(sk)->icsk_ca_state != state) { tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; + multitcp_cwnd_event(sk, __FUNCTION__); } } @@ -2511,7 +2805,7 @@ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - tcp_moderate_cwnd(tp); + tcp_moderate_cwnd(sk, tp); } else { tcp_cwnd_down(sk, flag); } @@ -2555,7 +2849,7 @@ * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) +static void tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2648,7 +2942,7 @@ if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (!tcp_try_undo_loss(sk)) { - tcp_moderate_cwnd(tp); + tcp_moderate_cwnd(sk, tp); tcp_xmit_retransmit_queue(sk); return; } @@ -2705,8 +2999,18 @@ tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); fast_rexmit = 1; + tp->may_keep_cwnd = 0; + multitcp_cwnd_event(sk,"tcp_fastretrans_alert: Setting may keep cwnd = 0"); + multitcp_cwnd_event(sk, __FUNCTION__); + goto score; + } + + if ((tp->snd_una > prior_snd_una) && icsk->icsk_ca_state==TCP_CA_Recovery){ + multitcp_cwnd_event(sk,"tcp_fastretrans_alert: Setting may keep cwnd = 1*"); + tp->may_keep_cwnd = 1; } +score: if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_down(sk, flag); @@ -2775,8 +3079,20 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); + if(sysctl_tcp_init_ssthresh && tcp_sk(sk)->snd_ssthresh > 65535U) + tcp_sk(sk)->snd_ssthresh=sysctl_tcp_init_ssthresh; + + if(tcp_should_est(sk)){ + 
tcp_reno_cong_avoid(sk, ack, in_flight); + tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; + multitcp_cwnd_event(sk, __FUNCTION__); + return; + } + + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; + multitcp_cwnd_event(sk, __FUNCTION__); } /* Restart timer after forward progress on connection. @@ -3066,13 +3382,14 @@ /* A very conservative spurious RTO response algorithm: reduce cwnd and * continue in congestion avoidance. */ -static void tcp_conservative_spur_to_response(struct tcp_sock *tp) +static void tcp_conservative_spur_to_response(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); - tcp_moderate_cwnd(tp); + tcp_moderate_cwnd(sk, tp); } /* A conservative spurious RTO response algorithm: reduce cwnd using @@ -3190,7 +3507,7 @@ tcp_undo_spur_to_response(sk, flag); break; case 1: - tcp_conservative_spur_to_response(tp); + tcp_conservative_spur_to_response(sk); break; default: tcp_ratehalving_spur_to_response(sk); @@ -3215,6 +3532,9 @@ u32 prior_fackets; int prior_packets; int frto_cwnd = 0; + + multitcp_ack_event(sk, ack); + tcp_pacing_recalc_delta(sk); /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. 
@@ -3292,7 +3612,7 @@ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); - tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, + tcp_fastretrans_alert(sk, prior_snd_una, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) @@ -4390,7 +4710,7 @@ sk->sk_write_space(sk); } -static void tcp_check_space(struct sock *sk) +void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); @@ -4488,6 +4808,7 @@ * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the semantics of SIOCATMARK (and thus sockatmark()) + multitcp_cwnd_event(sk, __FUNCTION__); * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -4623,6 +4944,7 @@ (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; sk->sk_data_ready(sk, 0); + multitcp_cwnd_event(sk, __FUNCTION__); } } else if (chunk > 0) { tp->ucopy.wakeup = 1; @@ -4634,6 +4956,7 @@ #endif /* CONFIG_NET_DMA */ /* + multitcp_cwnd_event(sk, __FUNCTION__); * TCP receive function for the ESTABLISHED state. * * It is split into a fast path and a slow path. The fast path is @@ -5245,7 +5568,7 @@ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); /* step 3: check security and precedence [ignored] */ - + /* step 4: * * Check for a SYN in window. 
@@ -5256,6 +5579,7 @@ return 1; } + /* step 5: check the ACK field */ if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); @@ -5425,3 +5749,5 @@ EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_initialize_rcv_mss); +EXPORT_SYMBOL(tcp_check_space); + diff -ruN linux-2.6.26/net/ipv4/tcp_output.c linux-2.6.26-MTCP-development/net/ipv4/tcp_output.c --- linux-2.6.26/net/ipv4/tcp_output.c 2008-07-13 23:51:29.000000000 +0200 +++ linux-2.6.26-MTCP-development/net/ipv4/tcp_output.c 2009-03-03 22:03:42.000000000 +0100 @@ -41,6 +41,8 @@ #include #include +int sysctl_tcp_rto_max = 120; + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -508,6 +510,18 @@ #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 + multitcp_snd_event(sk,tcb->seq); + if (tcp_pacing_enabled(sk)){ + tp->pacing.sent++; + + + if ((tp->pacing.burst) && (tp->pacing.burst <= tp->pacing.sent)){ + tcp_pacing_lock_tx(sk); + tp->pacing.sent = 0; + tcp_pacing_reset_timer(sk); + } + } + sysctl_flags = 0; if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -976,10 +990,10 @@ int doing_tso = 0; mss_now = tp->mss_cache; - +#ifndef CONFIG_TCP_MULTITCP if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) doing_tso = 1; - +#endif if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) @@ -1259,6 +1273,12 @@ const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; + /* TCP spacing conflicts with John's algorithm, so turn it off + * in case of rate-based sender. + */ + if(tcp_pacing_enabled(sk)) + goto send_now; + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) goto send_now; @@ -1454,7 +1474,6 @@ struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; - int result; /* If we are closed, the bytes will have to remain here. 
* In time closedown will finish, we empty the write queue and all @@ -1464,15 +1483,16 @@ return 0; sent_pkts = 0; - +#if 0 /* Do MTU probing. */ if ((result = tcp_mtu_probe(sk)) == 0) { return 0; } else if (result > 0) { sent_pkts = 1; } - - while ((skb = tcp_send_head(sk))) { +#endif + //printk(KERN_INFO "%s. pacing enbaled: %c, sent: %lu burst: %lu\n", __FUNCTION__, tcp_pacing_enabled(sk)?'y':'n', tp->pacing.sent, tcp_pacing_burst(sk)); + while ((skb = tcp_send_head(sk)) && (!tcp_pacing_enabled(sk) || tp->pacing.sent < tcp_pacing_burst(sk))) { unsigned int limit; tso_segs = tcp_init_tso_segs(sk, skb, mss_now); @@ -1484,7 +1504,9 @@ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - + if (tcp_pacing_locked(sk)){ + return 0; + } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? @@ -1503,6 +1525,7 @@ if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) break; + TCP_SKB_CB(skb)->when = tcp_time_stamp; @@ -1516,8 +1539,9 @@ tcp_minshall_update(tp, mss_now, skb); sent_pkts++; + } - + if (likely(sent_pkts)) { tcp_cwnd_validate(sk); return 0; @@ -1903,7 +1927,9 @@ skb->ip_summed = CHECKSUM_NONE; } } - + if (tcp_pacing_locked(sk)) + return -EAGAIN; + /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. 
*/
diff -ruN linux-2.6.26/net/ipv4/tcp_timer.c linux-2.6.26-MTCP-development/net/ipv4/tcp_timer.c
--- linux-2.6.26/net/ipv4/tcp_timer.c 2008-07-13 23:51:29.000000000 +0200
+++ linux-2.6.26-MTCP-development/net/ipv4/tcp_timer.c 2009-03-03 22:03:42.000000000 +0100
@@ -36,10 +36,22 @@
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
+#ifdef CONFIG_TCP_PACING
+int sysctl_tcp_pacing = 0;
+EXPORT_SYMBOL_GPL(sysctl_tcp_pacing);
+static void tcp_pacing_timer(unsigned long data);
+#endif
+
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
+#ifdef CONFIG_TCP_PACING
+	/* Per-socket pacing timer; the socket pointer is its argument. */
+	init_timer(&(tcp_sk(sk)->pacing.timer));
+	tcp_sk(sk)->pacing.timer.function=&tcp_pacing_timer;
+	tcp_sk(sk)->pacing.timer.data = (unsigned long) sk;
+#endif
+
 }
 EXPORT_SYMBOL(tcp_init_xmit_timers);
@@ -535,3 +547,127 @@
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
+
+#ifdef CONFIG_TCP_PACING
+/* Routines for TCP Pacing.
+ *
+ * Amit Aggarwal, Stefan Savage, and Thomas Anderson, "Understanding the Performance of TCP Pacing"
+ * Proc. of the IEEE INFOCOM 2000 Conference on Computer Communications, March 2000, pages 1157 - 1165.
+ *
+ * This is the timer used to spread packets.
+ * a delta value is computed on rtt/cwnd,
+ * and will be our expire interval.
+ *
+ * The handler always drops one socket reference on exit: the one taken
+ * by sock_hold() when the timer was armed. When the socket is owned by
+ * user context the timer is re-armed for the next jiffy instead of
+ * transmitting.
+ */
+static void tcp_pacing_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*) data;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/*
+	 * Pacing was switched off after the timer had been armed: there is
+	 * nothing to send, but the reference held for this timer must still
+	 * be released (returning directly here leaked it).
+	 */
+	if (!sysctl_tcp_pacing)
+		goto out_put;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later */
+		if (!mod_timer(&tp->pacing.timer, jiffies + 1))
+			sock_hold(sk);
+		goto out_unlock;
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/* Unlock sending, so when next ack is received it will pass.
+	 * If there are no packets scheduled, do nothing.
+	 */
+	tp->pacing.lock = 0;
+
+	if (!tcp_send_head(sk)){
+		/* Sending queue empty */
+		goto out;
+	}
+
+	/* Handler: retransmit first when segments are marked lost. */
+	if(tp->lost_out)
+		tcp_xmit_retransmit_queue(sk);
+	else
+		tcp_push_pending_frames(sk);
+
+ out:
+	if (tcp_memory_pressure)
+		sk_mem_reclaim(sk);
+
+ out_unlock:
+	bh_unlock_sock(sk);
+ out_put:
+	sock_put(sk);
+}
+
+/*
+ * The timer has to be restarted when a segment is sent out.
+ * It expires pacing.delta jiffies from now. A socket reference is taken
+ * when the timer was not already pending.
+ */
+void __tcp_pacing_reset_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* jiffies is unsigned long; a 32-bit copy truncates it on 64-bit. */
+	unsigned long timeout = jiffies + tp->pacing.delta;
+
+	if (!mod_timer(&tp->pacing.timer, timeout))
+		sock_hold(sk);
+}
+EXPORT_SYMBOL(__tcp_pacing_reset_timer);
+
+/*
+ * This routine computes tcp_pacing delay, using
+ * a simplified uniform pacing policy.
+ *
+ * The window (snd_cwnd, or the peer's window in segments when that is
+ * smaller) is shifted left by 3 before being compared with tp->srtt.
+ * NOTE(review): presumably this matches the <<3 scaling of srtt (other
+ * parts of this patch use tp->srtt >> 3) -- confirm.
+ *  - window <= srtt: one segment per timer, pacing.delta = srtt / window
+ *  - window  > srtt: one jiffy per timer, pacing.burst = window / srtt
+ * The remainder of the division is spread out with pacing.count: every
+ * "round" calls delta (or burst) gets one extra unit.
+ * Without a usable window or srtt, delta is 0 and burst is 1.
+ */
+void __tcp_pacing_recalc_delta(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 window = (tp->snd_cwnd)<<3;
+	__u32 srtt = tp->srtt;
+	__u32 round = 0;
+	__u32 curmss = tp->mss_cache;
+
+	/* curmss is a divisor below: skip the clamp while no MSS is cached. */
+	if (curmss && tp->snd_wnd/curmss < tp->snd_cwnd)
+		window = (tp->snd_wnd / curmss) << 3;
+
+	if(window>1 && srtt){
+		if (window <= srtt){
+			tp->pacing.delta = (srtt/window);
+			if (srtt % window)
+				round=((srtt / (srtt % window)));
+			else
+				round=0;
+			if (round && (tp->pacing.count >= round))
+				tp->pacing.count = 0;
+			if (round && tp->pacing.count == 0)
+				tp->pacing.delta++;
+			tp->pacing.burst = 1;
+		} else {
+			tp->pacing.delta = 1;
+			tp->pacing.burst = (window / srtt);
+			if (window % srtt)
+				round = ( (srtt / (window % srtt)));
+			else
+				round = 0;
+			/*
+			 * NOTE(review): unlike the branch above, the extra unit
+			 * is only added when the counter wraps, not when it is
+			 * already 0 -- kept as in the original.
+			 */
+			if (round && tp->pacing.count >= (round)){
+				tp->pacing.count = 0;
+				tp->pacing.burst++;
+			}
+		}
+		tp->pacing.count++;
+	} else {
+		tp->pacing.delta = 0;
+		tp->pacing.burst = 1;
+	}
+}
+
+EXPORT_SYMBOL(__tcp_pacing_recalc_delta);
+
+#endif