GRO是一种优化机制,让包尽可能在底层合并,给上层呈现一个大的数据包
驱动调用napi_gro_receive触发GRO流程
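napi_gro_receive首先调用skb_gro_reset_offset初始化GRO控制块(NAPI_GRO_CB)中的data_offset、frag0和frag0_len,然后由napi_skb_finish根据__napi_gro_receive的返回值做不同的处理:如果包已被合并(GRO_MERGED/GRO_MERGED_FREE)或者被保留在gro_list中(GRO_HELD),则不需要往上层协议栈送;如果返回GRO_NORMAL,则通过netif_receive_skb上报。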
-
/*
 * napi_skb_finish - act on the GRO verdict for a received buffer.
 * @ret: verdict produced by __napi_gro_receive() for @skb
 * @skb: the buffer the verdict refers to
 *
 * Returns @ret, except that a GRO_NORMAL verdict is turned into GRO_DROP
 * when netif_receive_skb() returns non-zero.
 */
gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		/* Not held or merged: pass the buffer up right away. */
		if (netif_receive_skb(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		/* fall through */
	case GRO_MERGED_FREE:
		/* The skb itself is no longer needed here: release it. */
		kfree_skb(skb);
		break;

	case GRO_HELD:
		/* fall through */
	case GRO_MERGED:
		/* No action is taken here for these verdicts. */
		break;
	}

	return ret;
}
-
-
void skb_gro_reset_offset(struct sk_buff *skb)
-
{
-
NAPI_GRO_CB(skb)->data_offset = 0;
-
NAPI_GRO_CB(skb)->frag0 = NULL;
-
NAPI_GRO_CB(skb)->frag0_len = 0;
-
-
if (skb->mac_header == skb->tail && //跟驱动强相关,r8169不会走这个流程
-
!PageHighMem(skb_shinfo(skb)->frags[0].page)) {
-
NAPI_GRO_CB(skb)->frag0 =
-
page_address(skb_shinfo(skb)->frags[0].page) +
-
skb_shinfo(skb)->frags[0].page_offset;
-
NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
-
}
-
}
-
-
/*
 * napi_gro_receive - driver entry point into GRO.
 * @napi: NAPI context whose gro_list is used for aggregation
 * @skb:  freshly received buffer
 *
 * Resets the GRO parsing state of @skb, runs it through
 * __napi_gro_receive() and lets napi_skb_finish() act on the verdict.
 * Returns the verdict reported by napi_skb_finish().
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	skb_gro_reset_offset(skb);

	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
}
GRO的判断分层进行,首先是判断L2层相关的内容,然后是L3、L4,先看L2层的判断:
-
static inline gro_result_t
-
__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-
{
-
struct sk_buff *p;
-
-
for (p = napi->gro_list; p; p = p->next) {
-
unsigned long diffs;
-
-
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; //网卡设备需要一样
-
diffs |= p->vlan_tci ^ skb->vlan_tci;
-
diffs |= compare_ether_header(skb_mac_header(p), //L2头需要一样,14个字节
-
skb_gro_mac_header(skb));
-
NAPI_GRO_CB(p)->same_flow = !diffs;
-
NAPI_GRO_CB(p)->flush = 0;
-
}
-
-
return dev_gro_receive(napi, skb);
-
}
L2层处理完后调用dev_gro_receive,这个涉及L3层的处理,需要相应的L3层协议提供相应的钩子函数,IP协议满足:
-
/*
 * Layer-3 handler registration for IPv4 (ETH_P_IP).  Besides the regular
 * receive function it supplies the GSO and GRO callbacks; gro_receive and
 * gro_complete are the hooks dev_gro_receive() looks for.
 */
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
	.gso_send_check = inet_gso_send_check,
	.gso_segment = inet_gso_segment,
	.gro_receive = inet_gro_receive,
	.gro_complete = inet_gro_complete,
};
后面还会涉及L4的操作,同样需要相应的L4协议提供钩子函数:TCP注册了gro_receive和gro_complete(tcp4_gro_receive/tcp4_gro_complete),而UDP没有注册类似的函数:
-
static const struct net_protocol tcp_protocol = {
-
.handler = tcp_v4_rcv,
-
.err_handler = tcp_v4_err,
-
.gso_send_check = tcp_v4_gso_send_check,
-
.gso_segment = tcp_tso_segment,
-
.gro_receive = tcp4_gro_receive,
-
.gro_complete = tcp4_gro_complete,
-
.no_policy = 1,
-
.netns_ok = 1,
-
}
看一下具体的流程:
-
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-
{
-
struct sk_buff **pp = NULL;
-
struct packet_type *ptype;
-
__be16 type = skb->protocol;
-
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
-
int same_flow;
-
int mac_len;
-
enum gro_result ret;
-
-
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) //GRO可以通过ethtool动态开启或者关闭,关闭的话直接返回NORMAL
-
goto normal;
-
-
if (skb_is_gso(skb) || skb_has_frag_list(skb))
-
goto normal;
-
-
rcu_read_lock();
-
list_for_each_entry_rcu(ptype, head, list) {
-
if (ptype->type != type || ptype->dev || !ptype->gro_receive) //只有ip_packet_type满足
-
continue;
-
-
skb_set_network_header(skb, skb_gro_offset(skb));
-
mac_len = skb->network_header - skb->mac_header;
-
skb->mac_len = mac_len;
-
NAPI_GRO_CB(skb)->same_flow = 0;
-
NAPI_GRO_CB(skb)->flush = 0;
-
NAPI_GRO_CB(skb)->free = 0;
-
-
pp = ptype->gro_receive(&napi->gro_list, skb);//inet_gro_receive
-
break;
-
}
-
rcu_read_unlock();
-
-
if (&ptype->list == head)
-
goto normal;
-
-
same_flow = NAPI_GRO_CB(skb)->same_flow;
-
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
-
-
if (pp) {
-
struct sk_buff *nskb = *pp;
-
-
*pp = nskb->next;
-
nskb->next = NULL;
-
napi_gro_complete(nskb);
-
napi->gro_count--;
-
}
-
-
if (same_flow)
-
goto ok;
-
-
if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
-
goto normal;
-
-
napi->gro_count++;
-
NAPI_GRO_CB(skb)->count = 1;
-
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
-
skb->next = napi->gro_list;
-
napi->gro_list = skb;
-
ret = GRO_HELD;
-
-
pull:
-
if (skb_headlen(skb) < skb_gro_offset(skb)) {
-
int grow = skb_gro_offset(skb) - skb_headlen(skb);
-
-
BUG_ON(skb->end - skb->tail < grow);
-
-
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
-
skb->tail += grow;
-
skb->data_len -= grow;
-
-
skb_shinfo(skb)->frags[0].page_offset += grow;
-
skb_shinfo(skb)->frags[0].size -= grow;
-
-
if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
-
put_page(skb_shinfo(skb)->frags[0].page);
-
memmove(skb_shinfo(skb)->frags,
-
skb_shinfo(skb)->frags + 1,
-
--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
-
}
-
}
-
-
ok:
-
return ret;
-
-
normal:
-
ret = GRO_NORMAL;
-
goto pull;
-
}
-
static struct sk_buff **inet_gro_receive(struct sk_buff **head,
-
struct sk_buff *skb)
-
{
-
const struct net_protocol *ops;
-
struct sk_buff **pp = NULL;
-
struct sk_buff *p;
-
const struct iphdr *iph;
-
unsigned int hlen;
-
unsigned int off;
-
unsigned int id;
-
int flush = 1;
-
int proto;
-
-
off = skb_gro_offset(skb);
-
hlen = off + sizeof(*iph);
-
iph = skb_gro_header_fast(skb, off);
-
if (skb_gro_header_hard(skb, hlen)) {
-
iph = skb_gro_header_slow(skb, hlen, off);
-
if (unlikely(!iph))
-
goto out;
-
} //最后iph指向skb对应的ip头,即skb->data
-
-
proto = iph->protocol & (MAX_INET_PROTOS - 1);
-
-
rcu_read_lock();
-
ops = rcu_dereference(inet_protos[proto]);
-
if (!ops || !ops->gro_receive)
-
goto out_unlock;
-
-
if (*(u8 *)iph != 0x45)
-
goto out_unlock;
-
-
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
-
goto out_unlock;
-
-
id = ntohl(*(__be32 *)&iph->id);
-
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); //ip头的第3-4个字节表示长度,长度不一样或者是分片的话就flush
-
id >>= 16;
-
-
for (p = *head; p; p = p->next) {
-
struct iphdr *iph2;
-
-
if (!NAPI_GRO_CB(p)->same_flow)
-
continue;
-
-
iph2 = ip_hdr(p);
-
-
if ((iph->protocol ^ iph2->protocol) | //IP层判断same flow需要L4协议一样,tos一样,源和目的地址一样
-
(iph->tos ^ iph2->tos) |
-
((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
-
((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
-
NAPI_GRO_CB(p)->same_flow = 0;
-
continue;
-
}
-
-
/* All fields must match except length and checksum. */ //只有flow一样的情况下才会设置flush
-
NAPI_GRO_CB(p)->flush |=
-
(iph->ttl ^ iph2->ttl) |
-
((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
-
-
NAPI_GRO_CB(p)->flush |= flush;
-
}
-
-
NAPI_GRO_CB(skb)->flush |= flush;
-
skb_gro_pull(skb, sizeof(*iph)); //NAPI_GRO_CB(skb)->data_offset += len;
-
skb_set_transport_header(skb, skb_gro_offset(skb)); //skb->transport_header = skb->data + offset;
-
-
pp = ops->gro_receive(head, skb);
-
-
out_unlock:
-
rcu_read_unlock();
-
-
out:
-
NAPI_GRO_CB(skb)->flush |= flush;
-
-
return pp;
-
}
-
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
-
{
-
struct sk_buff **pp = NULL;
-
struct sk_buff *p;
-
struct tcphdr *th;
-
struct tcphdr *th2;
-
unsigned int len;
-
unsigned int thlen;
-
__be32 flags;
-
unsigned int mss = 1;
-
unsigned int hlen;
-
unsigned int off;
-
int flush = 1;
-
int i;
-
-
off = skb_gro_offset(skb);
-
hlen = off + sizeof(*th);
-
th = skb_gro_header_fast(skb, off);
-
if (skb_gro_header_hard(skb, hlen)) {
-
th = skb_gro_header_slow(skb, hlen, off);
-
if (unlikely(!th))
-
goto out; //th指向TCP的头,即skb->data+off
-
}
-
-
thlen = th->doff * 4;
-
if (thlen < sizeof(*th))
-
goto out;
-
-
hlen = off + thlen;
-
if (skb_gro_header_hard(skb, hlen)) {
-
th = skb_gro_header_slow(skb, hlen, off);
-
if (unlikely(!th))
-
goto out;
-
}
-
-
skb_gro_pull(skb, thlen); // NAPI_GRO_CB(skb)->data_offset += len; 即等于L3+L4的头
-
-
len = skb_gro_len(skb); //skb->len - NAPI_GRO_CB(skb)->data_offset;即len表示取掉头的负载
-
flags = tcp_flag_word(th);
-
-
for (; (p = *head); head = &p->next) {
-
if (!NAPI_GRO_CB(p)->same_flow)
-
continue;
-
-
th2 = tcp_hdr(p);
-
-
if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { //tcp判断same flow的标准是源端口号一样
-
NAPI_GRO_CB(p)->same_flow = 0;
-
continue;
-
}
-
-
goto found;
-
}
-
-
goto out_check_final;
-
-
found:
-
flush = NAPI_GRO_CB(p)->flush;
-
flush |= (__force int)(flags & TCP_FLAG_CWR);
-
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
-
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
-
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
-
for (i = sizeof(*th); i < thlen; i += 4) //option区域必须一样,否则flush
-
flush |= *(u32 *)((u8 *)th + i) ^
-
*(u32 *)((u8 *)th2 + i);
-
-
mss = skb_shinfo(p)->gso_size;
-
-
flush |= (len - 1) >= mss;
-
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
-
-
if (flush || skb_gro_receive(head, skb)) {
-
mss = 1;
-
goto out_check_final;
-
}
-
-
p = *head;
-
th2 = tcp_hdr(p);
-
tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
-
-
out_check_final:
-
flush = len < mss;
-
flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
-
TCP_FLAG_RST | TCP_FLAG_SYN |
-
TCP_FLAG_FIN));
-
-
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
-
pp = head;
-
-
out:
-
NAPI_GRO_CB(skb)->flush |= flush;
-
-
return pp;
-
}
阅读(4138) | 评论(0) | 转发(0) |