IP头限制一个包最大为64K,但是实际上一般没法一次发送这么大的包,以太网一般有MTU限制一次发包的大小
MTU一般为1500字节。因此发包过程涉及packet fragmentation,该流程在L3层完成
TCP把类似的操作提到了L4层,引入了MSS的概念,这个一般是根据MTU生成的。
随着网卡速度的提升,网络收发包会占用大量cpu,为了尽量减少cpu的使用,网卡集成了很多之前软件完成的功能,即卸载功能
首先看关闭所有卸载功能时,发包大于1500字节的情况。
例子:负载为25000字节
__ip_append_data函数中涉及skb包的分配,length为25000+L4头(本例为UDP头,8字节)=25008
-
if (!skb)
-
goto alloc_new_skb;
-
-
while (length > 0) {
-
/* Check if the remaining data fits into current packet. */
-
copy = mtu - skb->len;
-
if (copy < length)
-
copy = maxfraglen - skb->len;
-
if (copy <= 0) {
-
char *data;
-
unsigned int datalen;
-
unsigned int fraglen;
-
unsigned int fraggap;
-
unsigned int alloclen;
-
struct sk_buff *skb_prev;
-
alloc_new_skb:
-
skb_prev = skb;
-
if (skb_prev)
-
fraggap = skb_prev->len - maxfraglen;
-
else
-
fraggap = 0;
-
-
/*
-
* If remaining data exceeds the mtu,
-
* we know we need more fragment(s).
-
*/
-
datalen = length + fraggap;
-
if (datalen > mtu - fragheaderlen)
-
datalen = maxfraglen - fragheaderlen; //maxfraglen=1500,fragheaderlen=20,datalen=1480
-
fraglen = datalen + fragheaderlen;
-
-
if ((flags & MSG_MORE) &&
-
!(rt->dst.dev->features&NETIF_F_SG))
-
alloclen = mtu;
-
else
-
alloclen = fraglen; //alloclen=1500
-
-
alloclen += exthdrlen;
-
-
/* The last fragment gets additional space at tail.
-
* Note, with MSG_MORE we overallocate on fragments,
-
* because we have no idea what fragment will be
-
* the last.
-
*/
-
if (datalen == length + fraggap)
-
alloclen += rt->dst.trailer_len;
-
-
if (transhdrlen) {
-
skb = sock_alloc_send_skb(sk,
-
alloclen + hh_len + 15, //第一个skb分配1500+16+15,在__alloc_skb,该大小会和L1 cacheline对齐(64),即变成1536
-
(flags & MSG_DONTWAIT), &err);
-
} else {
-
skb = NULL;
-
if (atomic_read(&sk->sk_wmem_alloc) <=
-
2 * sk->sk_sndbuf)
-
skb = sock_wmalloc(sk,
-
alloclen + hh_len + 15, 1,
-
sk->sk_allocation);
-
if (unlikely(skb == NULL))
-
err = -ENOBUFS;
-
else
-
/* only the initial fragment is
-
time stamped */
-
cork->tx_flags = 0;
-
}
-
if (skb == NULL)
-
goto error;
-
-
/*
-
* Fill in the control structures
-
*/
-
skb->ip_summed = csummode;
-
skb->csum = 0;
-
skb_reserve(skb, hh_len);
-
skb_shinfo(skb)->tx_flags = cork->tx_flags;
-
-
/*
-
* Find where to start putting bytes.
-
*/
-
data = skb_put(skb, fraglen + exthdrlen); //skb->len+=1500
-
skb_set_network_header(skb, exthdrlen);
-
skb->transport_header = (skb->network_header +
-
fragheaderlen);
-
data += fragheaderlen + exthdrlen;
-
-
if (fraggap) {
-
skb->csum = skb_copy_and_csum_bits(
-
skb_prev, maxfraglen,
-
data + transhdrlen, fraggap, 0);
-
skb_prev->csum = csum_sub(skb_prev->csum,
-
skb->csum);
-
data += fraggap;
-
pskb_trim_unique(skb_prev, maxfraglen);
-
}
-
-
copy = datalen - transhdrlen - fraggap; //copy=MTU-L3-L4=1472
-
//第二个以后的分片不再区别L4,每次拷贝1480字节
-
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
-
err = -EFAULT;
-
kfree_skb(skb);
-
goto error;
-
}
-
-
offset += copy;
-
length -= datalen - fraggap;
-
transhdrlen = 0;
-
exthdrlen = 0;
-
csummode = CHECKSUM_NONE;
-
-
/*
-
* Put the packet on the pending queue.
-
*/
-
__skb_queue_tail(queue, skb);
-
continue;
-
}
分配多个skb,拷贝数据,注意length包含L4的头,第二个开始的skb不再区别L4头。
其中涉及SG卸载功能的地方有:
-
if ((flags & MSG_MORE) &&
-
!(rt->dst.dev->features&NETIF_F_SG))
-
alloclen = mtu;
-
else
-
alloclen = fraglen
即有SG功能的话,每次都是按需分配;如果没有SG功能,同时又带有MSG_MORE标志的话,则按最大分段大小(mtu)进行分配。这么做的原因在于SG可以支持数据包对应的内存不连续,后续追加的数据可以放到单独的页中,而没有SG时只能预先在线性区留足空间
-
if (!(rt->dst.dev->features&NETIF_F_SG)) {
-
unsigned int off;
-
-
off = skb->len;
-
if (getfrag(from, skb_put(skb, copy),
-
offset, copy, off, skb) < 0) {
-
__skb_trim(skb, off);
-
err = -EFAULT;
-
goto error;
-
}
-
} else {
-
int i = skb_shinfo(skb)->nr_frags;
-
skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-
struct page *page = cork->page;
-
int off = cork->off;
-
unsigned int left;
-
-
if (page && (left = PAGE_SIZE - off) > 0) {
-
if (copy >= left)
-
copy = left;
-
if (page != frag->page) {
-
if (i == MAX_SKB_FRAGS) {
-
err = -EMSGSIZE;
-
goto error;
-
}
-
get_page(page);
-
skb_fill_page_desc(skb, i, page, off, 0);
-
frag = &skb_shinfo(skb)->frags[i];
-
}
-
} else if (i < MAX_SKB_FRAGS) {
-
if (copy > PAGE_SIZE)
-
copy = PAGE_SIZE;
-
page = alloc_pages(sk->sk_allocation, 0);
-
if (page == NULL) {
-
err = -ENOMEM;
-
goto error;
-
}
-
cork->page = page;
-
cork->off = 0;
-
-
skb_fill_page_desc(skb, i, page, 0, 0);
-
frag = &skb_shinfo(skb)->frags[i];
-
} else {
-
err = -EMSGSIZE;
-
goto error;
-
}
-
if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
-
err = -EFAULT;
-
goto error;
-
}
-
cork->off += copy;
-
frag->size += copy;
-
skb->len += copy;
-
skb->data_len += copy;
-
skb->truesize += copy;
-
atomic_add(copy, &sk->sk_wmem_alloc);
-
}
-
offset += copy;
-
length -= copy;
-
}
可以看到不管支不支持SG,都涉及内存的拷贝,只不过拷贝的目的地不一样而已。
理论上SG使得每个SKB支持16个页,大小为64K,但是实际上__ip_append_data分片的规格是MTU,大于MTU就会重新分配一个SKB
函数__ip_make_skb除了填充L3头外,还对skb的组织进行了修改:
后续的skb都被挂到第一个skb的frag_list上,所有的大小都会在第一个skb中体现,即本例中skb->len=25028,即25000+L3+L4
后续调用ip_finish_output进行发送,这里涉及另外一个卸载功能:GSO
-
static int ip_finish_output(struct sk_buff *skb)
-
{
-
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
-
/* Policy lookup after SNAT yielded a new policy */
-
if (skb_dst(skb)->xfrm != NULL) {
-
IPCB(skb)->flags |= IPSKB_REROUTED;
-
return dst_output(skb);
-
}
-
#endif
-
if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
-
return ip_fragment(skb, ip_finish_output2);
-
else
-
return ip_finish_output2(skb);
-
}
如果包的长度大于MTU,并且该skb不是GSO包(skb_is_gso()为假)的话,先调用ip_fragment进行分片,否则直接调用ip_finish_output2
-
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
-
{
-
struct iphdr *iph;
-
int ptr;
-
struct net_device *dev;
-
struct sk_buff *skb2;
-
unsigned int mtu, hlen, left, len, ll_rs;
-
int offset;
-
__be16 not_last_frag;
-
struct rtable *rt = skb_rtable(skb);
-
int err = 0;
-
-
dev = rt->dst.dev;
-
-
/*
-
* Point into the IP datagram header.
-
*/
-
-
iph = ip_hdr(skb);
-
-
if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-
htonl(ip_skb_dst_mtu(skb)));
-
kfree_skb(skb);
-
return -EMSGSIZE;
-
}
-
-
/*
-
* Setup starting values.
-
*/
-
-
hlen = iph->ihl * 4;
-
mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
-
#ifdef CONFIG_BRIDGE_NETFILTER
-
if (skb->nf_bridge)
-
mtu -= nf_bridge_mtu_reduction(skb);
-
#endif
-
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
-
-
/* When frag_list is given, use it. First, check its validity:
-
* some transformers could create wrong frag_list or break existing
-
* one, it is not prohibited. In this case fall back to copying.
-
*
-
* LATER: this step can be merged to real generation of fragments,
-
* we can switch to copy when see the first bad fragment.
-
*/
-
if (skb_has_frag_list(skb)) {
-
struct sk_buff *frag, *frag2;
-
int first_len = skb_pagelen(skb); //pagelen包含主buffer以及fragments的大小,不包含frag_list
-
-
if (first_len - hlen > mtu || // 本例中skb不包含fragments,只有frag_list,first_len=mtu+hlen
-
((first_len - hlen) & 7) || //条件头不满足,走fast流程
-
ip_is_fragment(iph) ||
-
skb_cloned(skb))
-
goto slow_path;
-
-
skb_walk_frags(skb, frag) { //对每个frag进行检查
-
/* Correct geometry. */
-
if (frag->len > mtu ||
-
((frag->len & 7) && frag->next) ||
-
skb_headroom(frag) < hlen)
-
goto slow_path_clean;
-
-
/* Partially cloned skb? */
-
if (skb_shared(frag))
-
goto slow_path_clean;
-
-
BUG_ON(frag->sk);
-
if (skb->sk) {
-
frag->sk = skb->sk;
-
frag->destructor = sock_wfree;
-
}
-
skb->truesize -= frag->truesize;
-
}
-
-
/* Everything is OK. */
-
-
err = 0;
-
offset = 0;
-
frag = skb_shinfo(skb)->frag_list;
-
skb_frag_list_init(skb);
-
skb->data_len = first_len - skb_headlen(skb); //data_len=0
-
skb->len = first_len; //len=1500
-
iph->tot_len = htons(first_len);
-
iph->frag_off = htons(IP_MF);
-
ip_send_check(iph);
-
-
for (;;) {
-
/* Prepare header of the next frame,
-
* before previous one went down. */
-
if (frag) {
-
frag->ip_summed = CHECKSUM_NONE;
-
skb_reset_transport_header(frag);
-
__skb_push(frag, hlen);
-
skb_reset_network_header(frag);
-
memcpy(skb_network_header(frag), iph, hlen); //赋值IP头
-
iph = ip_hdr(frag);
-
iph->tot_len = htons(frag->len);
-
ip_copy_metadata(frag, skb);
-
if (offset == 0)
-
ip_options_fragment(frag);
-
offset += skb->len - hlen;
-
iph->frag_off = htons(offset>>3);
-
if (frag->next != NULL)
-
iph->frag_off |= htons(IP_MF);
-
/* Ready, complete checksum */
-
ip_send_check(iph);
-
}
-
-
err = output(skb); //调用ip_finish_output2,除了第一个skb有L4头外,其他的skb没有
-
-
if (!err)
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
-
if (err || !frag)
-
break;
-
-
skb = frag;
-
frag = skb->next;
-
skb->next = NULL;
-
}
-
-
if (err == 0) {
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
-
return 0;
-
}
-
-
while (frag) {
-
skb = frag->next;
-
kfree_skb(frag);
-
frag = skb;
-
}
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-
return err;
-
-
slow_path_clean:
-
skb_walk_frags(skb, frag2) {
-
if (frag2 == frag)
-
break;
-
frag2->sk = NULL;
-
frag2->destructor = NULL;
-
skb->truesize += frag2->truesize;
-
}
-
}
-
-
slow_path:
-
left = skb->len - hlen; /* Space per frame */
-
ptr = hlen; /* Where to start from */
-
-
/* for bridged IP traffic encapsulated inside f.e. a vlan header,
-
* we need to make room for the encapsulating header
-
*/
-
ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
-
-
/*
-
* Fragment the datagram.
-
*/
-
-
offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
-
not_last_frag = iph->frag_off & htons(IP_MF);
-
-
/*
-
* Keep copying data until we run out.
-
*/
-
-
while (left > 0) {
-
len = left;
-
/* IF: it doesn't fit, use 'mtu' - the data space left */
-
if (len > mtu)
-
len = mtu;
-
/* IF: we are not sending up to and including the packet end
-
then align the next start on an eight byte boundary */
-
if (len < left) {
-
len &= ~7;
-
}
-
/*
-
* Allocate buffer.
-
*/
-
-
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
-
NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
-
err = -ENOMEM;
-
goto fail;
-
}
-
-
/*
-
* Set up data on packet
-
*/
-
-
ip_copy_metadata(skb2, skb);
-
skb_reserve(skb2, ll_rs);
-
skb_put(skb2, len + hlen);
-
skb_reset_network_header(skb2);
-
skb2->transport_header = skb2->network_header + hlen;
-
-
/*
-
* Charge the memory for the fragment to any owner
-
* it might possess
-
*/
-
-
if (skb->sk)
-
skb_set_owner_w(skb2, skb->sk);
-
-
/*
-
* Copy the packet header into the new buffer.
-
*/
-
-
skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
-
-
/*
-
* Copy a block of the IP datagram.
-
*/
-
if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
-
BUG();
-
left -= len;
-
-
/*
-
* Fill in the new header fields.
-
*/
-
iph = ip_hdr(skb2);
-
iph->frag_off = htons((offset >> 3));
-
-
/* ANK: dirty, but effective trick. Upgrade options only if
-
* the segment to be fragmented was THE FIRST (otherwise,
-
* options are already fixed) and make it ONCE
-
* on the initial skb, so that all the following fragments
-
* will inherit fixed options.
-
*/
-
if (offset == 0)
-
ip_options_fragment(skb);
-
-
/*
-
* Added AC : If we are fragmenting a fragment that's not the
-
* last fragment then keep MF on each bit
-
*/
-
if (left > 0 || not_last_frag)
-
iph->frag_off |= htons(IP_MF);
-
ptr += len;
-
offset += len;
-
-
/*
-
* Put this fragment into the sending queue.
-
*/
-
iph->tot_len = htons(len + hlen);
-
-
ip_send_check(iph);
-
-
err = output(skb2);
-
if (err)
-
goto fail;
-
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
-
}
-
kfree_skb(skb);
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
-
return err;
-
-
fail:
-
kfree_skb(skb);
-
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-
return err;
-
}
可以看到fast流程不需要拷贝数据:只是给frag_list中每个还没有L3头的skb补上L3头(拷贝前一个分片的IP头并修正tot_len、frag_off和校验和),然后逐个调用ip_finish_output2发送。
阅读(3047) | 评论(0) | 转发(0) |