2015年(4)
分类: 嵌入式
2015-08-06 14:09:37
原文地址:linux 内核tcp数据发送的实现 作者:lwchsz
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { struct sock_iocb *si = kiocb_to_siocb(iocb); int err; si->sock = sock; si->scm = NULL; si->msg = msg; si->size = size; err = security_socket_sendmsg(sock, msg, size); if (err) return err; ///这里就会调用tcp_sendmsg. return sock->ops->sendmsg(iocb, sock, msg, size); }
///首先取出句柄的flag,主要是看是非阻塞还是阻塞模式. flags = msg->msg_flags; ///这里取得发送超时时间. timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); ///如果connect还没有完成则等待连接完成(如是非阻塞则直接返回). if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_err; /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); ///取出当前的mss,在tcp_current_mss还会设置xmit_size_goal,这个值一般都是等于mss,除非有gso的情况下,有所不同.这里我们就认为他是和mms相等的. mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); size_goal = tp->xmit_size_goal;
/*
 * Condition quoted from the send loop: it is true when tcp_send_head(sk)
 * yields nothing, or when the space left up to size_goal in skb is <= 0.
 * Note the side effect: when the first operand is false, copy is assigned
 * size_goal - skb->len.
 */
!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
iov = msg->msg_iov;
/* Number of bytes copied so far. */
copied = 0;

err = -EPIPE;
/* If the socket has an error or the sending side has been shut down, bail out with err set. */
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
	goto do_error;

while (--iovlen >= 0) {
	/* Length of the current buffer. */
	int seglen = iov->iov_len;
	/* Base address of the buffer. */
	unsigned char __user *from = iov->iov_base;

	iov++;

	while (seglen > 0) {
		int copy;

		/*
		 * The socket's send queue, sk_write_queue, is a doubly linked
		 * list; tcp_write_queue_tail fetches its last element (per the
		 * article it returns NULL when the list is empty).
		 */
		skb = tcp_write_queue_tail(sk);

		/* Discussed above: mainly checks whether the tail buffer still has free room. */
		if (!tcp_send_head(sk) ||
		    (copy = size_goal - skb->len) <= 0) {

new_segment:
			/* Start allocating a new segment. */
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			/* The size comes from select_size; per the article it is normally the mss. */
			skb = sk_stream_alloc_skb(sk, select_size(sk), sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			/*
			 * Check whether we can use HW checksum.
			 */
			if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
				skb->ip_summed = CHECKSUM_PARTIAL;

			/* Queue this skb on sk_write_queue and update sk_send_head. */
			skb_entail(sk, skb);
			/* A fresh skb can take a full size_goal worth of data. */
			copy = size_goal;
		}
/* Returns skb->data_len, i.e. non-zero exactly when data_len is non-zero. */
static inline int skb_is_nonlinear(const struct sk_buff *skb)
{
	return skb->data_len;
}

/*
 * skb_tailroom - free bytes between skb->tail and skb->end.
 *
 * Returns 0 whenever skb_is_nonlinear() reports a non-zero data_len,
 * otherwise skb->end - skb->tail.
 */
static inline int skb_tailroom(const struct sk_buff *skb)
{
	/* Per the article: a freshly allocated skb yields its tailroom, otherwise 0 is returned. */
	return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail;
}
while (--iovlen >= 0) {
	...........................
	while (seglen > 0) {
		/* If copy exceeds what is left of the user buffer, shrink copy. */
		if (copy > seglen)
			copy = seglen;

		/* Look at the skb's tailroom; per the article, a value above 0 means this is a newly created skb. */
		if (skb_tailroom(skb) > 0) {
			/* If more data is to be copied than there is room left, copy only what the current skb can hold for now. */
			if (copy > skb_tailroom(skb))
				copy = skb_tailroom(skb);
			/* Copy copy bytes into the sk_buff; a non-zero result (failure) jumps to do_fault, which is analysed further below. */
			if ((err = skb_add_data(skb, from, copy)) != 0)
				goto do_fault;
		}
while (--iovlen >= 0) {
	...........................
	while (seglen > 0) {
		...............................
		else {
			int merge = 0;
			/* nr_frags: number of entries in use in the array that holds the physical pages. */
			int i = skb_shinfo(skb)->nr_frags;
			/* The socket's current send page. */
			struct page *page = TCP_PAGE(sk);
			/* Offset within the current page. */
			int off = TCP_OFF(sk);

			/*
			 * Check whether the send page is already present in the
			 * skb's frags and is not full; if so the data is simply
			 * merged into that page instead of adding another page
			 * to frags.
			 */
			if (skb_can_coalesce(skb, i, page, off) &&
			    off != PAGE_SIZE) {
				merge = 1;
			} else if (i == MAX_SKB_FRAGS ||
				   (!i && !(sk->sk_route_caps & NETIF_F_SG))) {
				/*
				 * Getting here means either the device does not
				 * support SG IO or the frags array is full
				 * (nr_frags is bounded).  tcp_mark_push is
				 * called to set a PSH mark and a new segment is
				 * started.
				 */
				tcp_mark_push(tp, skb);
				goto new_segment;
			} else if (page) {
				if (off == PAGE_SIZE) {
					/* The current send page is full. */
					put_page(page);
					TCP_PAGE(sk) = page = NULL;
					off = 0;
				}
			} else
				off = 0;

			if (copy > PAGE_SIZE - off)
				copy = PAGE_SIZE - off;
			.................................
			/* If page is NULL a new physical page has to be allocated. */
			if (!page) {
				/* Allocate new cache page. */
				if (!(page = sk_stream_alloc_page(sk)))
					goto wait_for_memory;
			}

			/* Copy the data into this page. */
			err = skb_copy_to_page(sk, from, skb, page, off, copy);
			if (err) {
				/* Error path: keep the page cached on the socket if none is cached. */
				if (!TCP_PAGE(sk)) {
					TCP_PAGE(sk) = page;
					TCP_OFF(sk) = 0;
				}
				goto do_error;
			}

			/* Was the data merged into an existing fragment, or does it start a new one? */
			if (merge) {
				/* Data was appended to an existing page, so only the size needs updating. */
				skb_shinfo(skb)->frags[i - 1].size += copy;
			} else {
				/* Add this page to the skb's frags. */
				skb_fill_page_desc(skb, i, page, off, copy);
				if (TCP_PAGE(sk)) {
					/* Take a reference on the page. */
					get_page(page);
				} else if (off + copy < PAGE_SIZE) {
					get_page(page);
					TCP_PAGE(sk) = page;
				}
			}

			/* Record the new offset. */
			TCP_OFF(sk) = off + copy;
		}
/*
 * Functions on the push path, in the order the article walks through them.
 * tcp_push() calls __tcp_push_pending_frames(); the article lists
 * tcp_write_xmit() as the next step (that call is not shown in this excerpt).
 */
static inline void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle);
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle);
static inline void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_send_head(sk)) { struct sk_buff *skb = tcp_write_queue_tail(sk); ///MSG_MORE这个参数我们在ip_append_data那里已经介绍过了,就是告诉ip层,我这里主要是一些小的数据包,然后ip层就会提前划分一个mtu大小的buf,然后等待数据的到来.因此如果没有设置这个或者forced_push返回真(我们写了超过最大窗口一般的数据),就标记一个PSH. if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags, skb); ///这里还是根据是否有设置MSG_MORE来判断使用哪个flags.因此可以看到如果我们设置了tcp_cork套接字选项和设置msg的MSG_MORE比较类似.最终调用tcp_push都会传递给__tcp_push_pending_frames的参数为TCP_NAGLE_CORK . __tcp_push_pending_frames(sk, mss_now, (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); } }
///这个函数就不介绍了,内核的注释很详细. /* Return 0, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_NODELAY was set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { return (skb->len < mss_now && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss, int nonagle) { ///如果设置了TCP_NAGLE_PUSH则返回1,也就是数据可以立即发送 if (nonagle & TCP_NAGLE_PUSH) return 1; /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; ///再次检测 nonagle域,相关的检测,上面已经说明了. if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) return 1; return 0; }
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; ///检测状态. if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; sent_pkts = 0; ///探测mtu. if ((result = tcp_mtu_probe(sk)) == 0) { return 0; } else if (result > 0) { sent_pkts = 1; } ///开始处理数据包. while ((skb = tcp_send_head(sk))) { unsigned int limit; tso_segs = tcp_init_tso_segs(sk, skb, mss_now); BUG_ON(!tso_segs); ///主要用来测试congestion window.. cwnd_quota = tcp_cwnd_test(tp, skb); if (!cwnd_quota) break; if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; if (tso_segs == 1) { ///主要看这里,如果这个skb是写队列的最后一个buf,则传输TCP_NAGLE_PUSH给tcp_nagle_test,这个时侯直接返回1,于是接着往下面走,否则则说明数据包不要求理解发送,我们就跳出循环(这时数据段就不会被发送).比如设置了TCP_CORK. if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) break; } else { if (tcp_tso_should_defer(sk, skb)) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) break; TCP_SKB_CB(skb)->when = tcp_time_stamp; ///传输数据给3层. if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts++; } if (likely(sent_pkts)) { tcp_cwnd_validate(sk); return 0; } return !tp->packets_out && tcp_send_head(sk); }
while (--iovlen >= 0) {
	...........................
	while (seglen > 0) {
		...............................
		/* While nothing has been copied yet in this call, clear the PSH flag on the segment. */
		if (!copied)
			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

		/* Advance the write sequence number and the segment's end sequence. */
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->gso_segs = 0;

		/* Advance the buffer address and the count of copied bytes. */
		from += copy;
		copied += copy;
		/* All buffers have been copied: leave the loop and send this segment. */
		if ((seglen -= copy) == 0 && iovlen == 0)
			goto out;

		/*
		 * If the skb holds less than size_goal, or MSG_OOB is set,
		 * keep looping.  Per the article, with out-of-band data the
		 * following iteration leaves the loop and tcp_push then
		 * sends the data.
		 */
		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		/*
		 * forced_push checks whether more than half a window's worth
		 * of data has been written towards the peer; if so a push
		 * (PSH) has to be sent.
		 */
		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			/* Call __tcp_push_pending_frames to send out all the segments the Nagle algorithm has been holding back. */
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			/* If skb is exactly the next buffer due to be sent, transmit just this one. */
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			/* Out of memory: try to send out whatever data Nagle is buffering locally. */
			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		/* Refresh mss_now and size_goal after waiting. */
		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
		size_goal = tp->xmit_size_goal;
	}
}
out:
	/* What is done on a successful return. */
	if (copied)
		/*
		 * The nagle argument used here is tp->nonagle, i.e. it
		 * depends on whether the socket option has Nagle enabled.
		 * Per the article: with Nagle off the data is sent out
		 * immediately, otherwise the Nagle algorithm is followed and
		 * small pieces of data are buffered.
		 */
		tcp_push(sk, flags, mss_now, tp->nonagle);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return copied;

do_fault:
	if (!skb->len) {
		/* Unlink the current buffer from the write queue. */
		tcp_unlink_write_queue(skb, sk);
		/* Update send_head. */
		tcp_check_send_head(sk, skb);
		/* Free the skb. */
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied)
		/* A non-zero copied means part of the data was queued successfully, so go to out and return that count. */
		goto out;
out_err:
	/* Otherwise take the error path. */
	err = sk_stream_error(sk, flags, err);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;