1. TCP接收窗口的调整
上面配置的rcv_buf是接收缓存的大小。在TCP层中,接收窗口不能完全占满接收缓存,因为TCP层描述的接收窗口仅指TCP层的数据(payload),不包含数据包的header部分,也就是不包含TCP header、IP header和一些选项信息。
在TCP三次握手时,双方会通告各自的接收窗口以及窗口扩大因子(window scale)。扩大因子只在SYN/SYN-ACK报文的选项中携带,因此使用wireshark抓包时,如果没有抓到三次握手的报文,wireshark就无法准确获取窗口扩大因子,并会在后续报文中给出相应的提示信息(例如 window size scaling factor: -1 (unknown))。
在TCP的发送函数tcp_transmit_skb中,会调用tcp_select_window函数来选择要通告的窗口大小。TCP头部中的窗口字段只有16位,最大只能表示65535字节(约64KB),因此可以通过window scale选项来扩大可通告的接收窗口大小。
1.1 tcp_transmit_skb
-
/*
 * Excerpt of tcp_transmit_skb(): only the parts that size the TCP options
 * and fill in the advertised window are shown; the rows of ellipses mark
 * code that the excerpt leaves out.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
-
gfp_t gfp_mask)
-
{
-
……………………..
-
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) /* SYN segments get their options from tcp_syn_options() */
-
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
-
else
-
tcp_options_size = tcp_established_options(sk, skb, &opts,
-
&md5);
-
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
-
………..
-
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-
/* RFC1323: The window in SYN & SYN/ACK segments
 * is never scaled.
 * During the three-way handshake the window is therefore sent
 * unscaled, capped at the 16-bit maximum of 65535.
 */
-
th->window = htons(min(tp->rcv_wnd, 65535U));
-
} else {
-
th->window = htons(tcp_select_window(sk));
-
}
-
…………………………
-
}
-
-
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
-
struct tcp_out_options *opts,
-
struct tcp_md5sig_key **md5)
-
{
-
struct tcp_sock *tp = tcp_sk(sk);
-
unsigned int remaining = MAX_TCP_OPTION_SPACE;
-
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
-
-
#ifdef CONFIG_TCP_MD5SIG
-
*md5 = tp->af_specific->md5_lookup(sk, sk);
-
if (*md5) {
-
opts->options |= OPTION_MD5;
-
remaining -= TCPOLEN_MD5SIG_ALIGNED;
-
}
-
#else
-
*md5 = NULL;
-
#endif
-
-
opts->mss = tcp_advertise_mss(sk);
-
remaining -= TCPOLEN_MSS_ALIGNED;
-
-
if (likely(sysctl_tcp_timestamps && !*md5)) {
-
opts->options |= OPTION_TS;
-
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
-
opts->tsecr = tp->rx_opt.ts_recent;
-
remaining -= TCPOLEN_TSTAMP_ALIGNED;
-
}
-
if (likely(sysctl_tcp_window_scaling)) { /* 如果开启sysctl_tcp_window_scaling 选项则在syn报文中添加scale的选项信息*/
-
opts->ws = tp->rx_opt.rcv_wscale;
-
opts->options |= OPTION_WSCALE;
-
remaining -= TCPOLEN_WSCALE_ALIGNED;
-
}
-
if (likely(sysctl_tcp_sack)) {
-
opts->options |= OPTION_SACK_ADVERTISE;
-
if (unlikely(!(OPTION_TS & opts->options)))
-
remaining -= TCPOLEN_SACKPERM_ALIGNED;
-
}
-
-
if (fastopen && fastopen->cookie.len >= 0) {
-
u32 need = fastopen->cookie.len;
-
-
need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
-
TCPOLEN_FASTOPEN_BASE;
-
need = (need + 3) & ~3U; /* Align to 32 bits */
-
if (remaining >= need) {
-
opts->options |= OPTION_FAST_OPEN_COOKIE;
-
opts->fastopen_cookie = &fastopen->cookie;
-
remaining -= need;
-
tp->syn_fastopen = 1;
-
tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
-
}
-
}
-
-
return MAX_TCP_OPTION_SPACE - remaining;
-
}
1.2 tcp_select_window
tcp_select_window函数根据当前的缓存情况,确定最终通告的window大小。
-
static u16 tcp_select_window(struct sock *sk)
-
{
-
struct tcp_sock *tp = tcp_sk(sk);
-
u32 old_win = tp->rcv_wnd;
-
u32 cur_win = tcp_receive_window(tp); /*计算当前剩余接收窗口*/
-
u32 new_win = __tcp_select_window(sk);/*这里根据剩余的接收缓存,计算新的接收窗口*/
-
-
/* Never shrink the offered window 不允许缩小已经分配的窗口大小*/
-
if (new_win < cur_win) {
-
if (new_win == 0)
-
NET_INC_STATS(sock_net(sk),
-
LINUX_MIB_TCPWANTZEROWINDOWADV);
-
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
-
}
-
tp->rcv_wnd = new_win;
-
tp->rcv_wup = tp->rcv_nxt;
-
-
/* Make sure we do not exceed the maximum possible
-
* scaled window.
-
确保接收窗口不超过规定的最大值*/
-
if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
-
new_win = min(new_win, MAX_TCP_WINDOW);
-
else
-
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
-
-
/* RFC1323 scaling applied 根据三次握手时确定的scale,按照scale缩小接收窗口的大小*/
-
new_win >>= tp->rx_opt.rcv_wscale;
-
-
/* If we advertise zero window, disable fast path. */
-
if (new_win == 0) {
-
tp->pred_flags = 0;
-
if (old_win)
-
NET_INC_STATS(sock_net(sk),
-
LINUX_MIB_TCPTOZEROWINDOWADV);
-
} else if (old_win == 0) {
-
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
-
}
-
/*返回新的可用的接收窗口,这个值就是在wireshark抓包时看到的值。*/
-
return new_win;
-
}
1.3 __tcp_select_window
-
u32 __tcp_select_window(struct sock *sk)
-
{
-
struct inet_connection_sock *icsk = inet_csk(sk);
-
struct tcp_sock *tp = tcp_sk(sk);
-
/* MSS for the peer's data. Previous versions used mss_clamp
-
* here. I don't know if the value based on our guesses
-
* of peer's MSS is better for the performance. It's more correct
-
* but may be worse for the performance because of rcv_mss
-
* fluctuations. --SAW 1998/11/1
-
*/
-
int mss = icsk->icsk_ack.rcv_mss; //接收端mss的大小
-
int free_space = tcp_space(sk); // 可以使用的接收缓存大小,是(sk->sk_rcvbuf-sk->sk_rmem_alloc) /2
-
int allowed_space = tcp_full_space(sk); // 为sk->sk_rcvbuf/2
-
int full_space = min_t(int, tp->window_clamp, allowed_space); //总的接收缓存
-
int window;
-
-
if (mss > full_space)
-
mss = full_space;
-
/* 如果可用的缓存小于总接收缓存的1/2的话,则说明内存吃紧*/
-
if (free_space < (full_space >> 1)) {
-
icsk->icsk_ack.quick = 0;
-
-
if (sk_under_memory_pressure(sk)) /*如果处于内存压力的状态,则修改接收窗口的阈值*/
-
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
-
4U * tp->advmss);
-
-
/* free_space might become our new window, make sure we don't
-
* increase it due to wscale.
-
*/
-
free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
-
/*剩余接收缓存不足时,则直接返回0*/
-
if (free_space < (allowed_space >> 4) || free_space < mss)
-
return 0;
-
}
-
-
if (free_space > tp->rcv_ssthresh)
-
free_space = tp->rcv_ssthresh;
-
-
/* Don't do rounding if we are using window scaling, since the
-
* scaled window will not line up with the MSS boundary anyway.
-
*/
-
window = tp->rcv_wnd;
-
if (tp->rx_opt.rcv_wscale) { /*接收扩大因子不为0 的话,则重新设置window*/
-
window = free_space;
-
-
/* Advertise enough space so that it won't get scaled away.
-
* Import case: prevent zero window announcement if
-
* 1<<rcv_wscale > mss.
-
这里进行防治四舍五入导致的接收窗口较小*/
-
if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
-
window = (((window >> tp->rx_opt.rcv_wscale) + 1)
-
<< tp->rx_opt.rcv_wscale);
-
} else {
-
-
if (window <= free_space - mss || window > free_space)
-
window = (free_space / mss) * mss;
-
else if (mss == full_space &&
-
free_space > window + (full_space >> 1))
-
window = free_space;
-
}
-
/*返回及时后的window*/
-
return window;
-
}
1.4 tcp_space
-
/* Note: caller must be prepared to deal with negative returns */
-
static inline int tcp_space(const struct sock *sk)
-
{
-
/*这里的sk_rmem_alloc表示的是该socket已经被接收到的数据包占用的空间大小
-
http://vger.kernel.org/~davem/skb_sk.html */
-
return tcp_win_from_space(sk->sk_rcvbuf -
-
atomic_read(&sk->sk_rmem_alloc));
-
}
-
-
static inline int tcp_full_space(const struct sock *sk)
-
{
-
return tcp_win_from_space(sk->sk_rcvbuf);
-
}
-
-
static inline int tcp_win_from_space(int space)
-
{
-
/*这里sysctl_tcp_adv_win_scale 在较新的内核为1,原先为2,Upstream commit b49960a05e32121d29316cfdf653894b88ac9190 是修改的patch,里面说明了修改的原因。 如果sysctl_tcp_adv_win_scale为1的话,这里的tcp可以使用的空间有原先的3/4修改为1/2,说明tcp数据包中的其他字段开销变大了*/
-
return sysctl_tcp_adv_win_scale<=0 ?
-
(space>>(-sysctl_tcp_adv_win_scale)) :
-
space - (space>>sysctl_tcp_adv_win_scale);
-
}
阅读(1025) | 评论(0) | 转发(0) |