内核ipv4的检验和处理-qtdszws-ChinaUnix博客

qtdszwsqtdszws.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

qtdszws

博客访问： 380285
博文数量： 64
博客积分： 2975
博客等级：少校
技术积分： 831
用户组：普通用户
注册时间： 2007-01-14 10:59

文章分类

全部博文（64）

文章存档

2014年（2）

2012年（7）

2010年（40）

2009年（5）

2008年（8）

2007年（2）

我的朋友

相关博文

内核ipv4的检验和处理

分类： LINUX

2008-11-18 12:33:57

网络协议规定,ip头部必须使用检验和并被检验,tcp必须使用检验和，udp可以选择是否使用检验和。

1.ip头部检验和
ip头部检验和只检验ip头部，而不包含数据部分。这样做的目的是能减少计算量，加速ip协议的处理，在路由时很有用。

1.1 ip发送时的检验和计算
/*
 * Excerpt of ip_build_xmit(): only the header-checksum step is quoted here;
 * the declaration of iph and the rest of the body are not shown.
 */
int ip_build_xmit(struct sock *sk,
    int getfrag (const void *,
          char *,
          unsigned int,
          unsigned int),
    const void *frag,
    unsigned length,
    struct ipcm_cookie *ipc,
    struct rtable *rt,
    int flags)
{
/* Store the header checksum; iph->ihl is passed as the length argument. */
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

/*
 * ip_fast_csum - checksum an IPv4 header (x86 inline assembly).
 * @iph: first byte of the header
 * @ihl: header length counted in 32-bit words
 *
 * Adds the header up 32 bits at a time with end-around carry, folds the
 * result to 16 bits and returns its complement, so a header that already
 * carries a correct checksum yields 0.  When ihl <= 4 the assembly jumps
 * straight to the exit label and the value returned is not a checksum.
 *
 * The template is written as adjacent string literals (a string literal
 * may not span source lines), and a "memory" clobber is declared: the
 * header is read through the pointer only, with no memory operand, so
 * without the clobber the compiler is free to move stores to the header
 * past this statement.
 */
static inline unsigned short ip_fast_csum(unsigned char * iph,
unsigned int ihl) {
	unsigned int sum;

	__asm__ __volatile__(
		"movl (%1), %0\n\t"		/* sum = word 0 */
		"subl $4, %2\n\t"
		"jbe 2f\n\t"			/* ihl <= 4: bail out */
		"addl 4(%1), %0\n\t"		/* + word 1 */
		"adcl 8(%1), %0\n\t"		/* + word 2 + carry */
		"adcl 12(%1), %0\n"		/* + word 3 + carry */
		"1:\tadcl 16(%1), %0\n\t"	/* + word 4, then 5, 6, ... */
		"lea 4(%1), %1\n\t"		/* advance 4 bytes; lea leaves CF alone */
		"decl %2\n\t"			/* words left? dec leaves CF alone */
		"jne 1b\n\t"
		"adcl $0, %0\n\t"		/* add the last carry */
		"movl %0, %2\n\t"
		"shrl $16, %0\n\t"
		"addw %w2, %w0\n\t"		/* high half + low half */
		"adcl $0, %0\n\t"		/* end-around carry of the fold */
		"notl %0\n"			/* one's complement */
		"2:"
	/* Since the input registers which are loaded with iph and ihl
	   are modified, we must also specify them as outputs, or gcc
	   will assume they contain their original values. */
	: "=r" (sum), "=r" (iph), "=r" (ihl)
	: "1" (iph), "2" (ihl)
	: "memory");
	return(sum);
}

1.2 ip接收时的检验和计算
/*
 * Excerpt of ip_rcv(): only the header sanity checks are quoted; the
 * declaration of iph, the inhdr_error label and the rest of the body are
 * not shown.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
if (skb->len < sizeof(struct iphdr) // packet shorter than the basic IP header (20 bytes per the original note)
  || skb->len < (iph->ihl<<2))// packet shorter than the header length the header itself claims
  goto inhdr_error;
if (iph->ihl < 5
  || iph->version != 4
  || ip_fast_csum((u8 *)iph, iph->ihl) != 0)// header checksum: any non-zero result is an error
  goto inhdr_error;
}

计算结果不为0，出错

1.3 ip转发时的检验和计算
/*
 * Excerpt of ip_forward(): only the TTL step is quoted; the declaration of
 * iph and the rest of the body are not shown.
 */
int ip_forward(struct sk_buff *skb)
{
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
}

ip转发时需要递减ttl,因此检验和要重新计算

/* The function in 2.2 was invalid, producing wrong result for
* check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
/*
 * Decrement the TTL of *iph and patch iph->check incrementally instead of
 * summing the header again.  Returns the decremented TTL.
 */
static inline int ip_decrease_ttl(struct iphdr *iph)
{
u32 check = iph->check;
/* Lowering the TTL lowers the summed data by htons(0x0100), so the stored
 * complement has to grow by the same amount. */
check += __constant_htons(0x0100);
/* Add one more when the sum reached or passed 0xFFFF (end-around carry);
 * presumably the result is truncated to the width of iph->check. */
iph->check = check + (check>=0xFFFF);
return --iph->ttl;
}
假定除去检验和之外的其他字段的检验和为x,则检验和为~x,x+~x=-1,经过ip_fast_csum计算后正好为0.
现在ttl--,相当于x=x-__constant_htons(0x0100),为了保证ip_fast_csum正确，只需(x-__constant_htons(0x0100))+(~x+__constant_htons(0x0100))=-1,所以新的检验和为~x+__constant_htons(0x0100),即check += __constant_htons(0x0100);
当check>=0xfeff,check += __constant_htons(0x0100)后，check>=0xffff,且只可能进一位，将进位重新加入check
iph->check = check + (check>=0xFFFF);

2.udp检验和
udp检验和是可选的，当检验和字段为0时，不检验，否则检验。如果检验的话，会附加一个伪首部一起检验。当检验和为0时，用0xffff代替,这样x+~x=-1,就成了0xffff+0xffff=0x1FFFE=0x1+0xfffe=0xffff=-1,仍然正确

2.1 udp发送检验和计算
/*
 * Excerpt of udp_sendmsg(): only the set-up of the fake header and the
 * hand-off to ip_build_xmit() are quoted; the opening brace, the local
 * declarations and the rest of the body are not shown.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
ufh.uh.len = htons(ulen);
ufh.uh.check = 0;
ufh.iov = msg->msg_iov;
ufh.wcheck = 0;

err = ip_build_xmit(sk,
       (sk->no_check == UDP_CSUM_NOXMIT ?
        udp_getfrag_nosum :// copies the user data into the kernel without checksumming it
        udp_getfrag),// copies the user data into the kernel and checksums it
       &ufh, ulen, &ipc, rt, msg->msg_flags);
}

如果sk->no_check设为
/* Note: this must match 'valbool' in sock_setsockopt */
#define UDP_CSUM_NOXMIT 1
则不检验，否则检验.sk->no_check可以用

/*
 * Excerpt of sock_setsockopt(): only the SO_NO_CHECK case is quoted; the
 * enclosing switch and the computation of valbool are not shown.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
      char *optval, int optlen)
{
/* SO_NO_CHECK stores the option value in sk->no_check. */
case SO_NO_CHECK:
   sk->no_check = valbool;
   break;
}
修改

/*
 * Fragment-fill callback used when no UDP checksum is transmitted.
 *
 * p       - the struct udpfakehdr prepared by the sender
 * to      - destination buffer for this fragment
 * offset  - position of this fragment, counted from the start of the UDP header
 * fraglen - number of bytes to produce for this fragment
 *
 * Only the fragment at offset 0 carries the UDP header; every other
 * fragment is payload only.  Returns what memcpy_fromiovecend() returns.
 */
static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen)
{
	struct udpfakehdr *fake = (struct udpfakehdr *)p;

	if (offset != 0) {
		/* Payload-only fragment: the iovec offset does not include the header. */
		return memcpy_fromiovecend(to, fake->iov, offset-sizeof(struct udphdr),
			fraglen);
	}

	/* First fragment: emit the UDP header, then the payload right behind it. */
	memcpy(to, fake, sizeof(struct udphdr));
	return memcpy_fromiovecend(to+sizeof(struct udphdr), fake->iov, offset,
		fraglen-sizeof(struct udphdr));
}

/*
 * Fragment-fill callback used when the UDP checksum is computed in software.
 * Parameters are the same as for udp_getfrag_nosum().
 *
 * Payload bytes are copied and summed into the fake header's wcheck.  For
 * the fragment at offset 0 the UDP header and the pseudo-header are summed
 * in as well and the finished header is written to the front of the buffer.
 * Returns 0 on success or -EFAULT when the copy fails.
 */
static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen)
{
	struct udpfakehdr *fake = (struct udpfakehdr *)p;

	if (offset != 0) {
		/* Payload-only fragment: just copy and accumulate. */
		if (csum_partial_copy_fromiovecend(to, fake->iov, offset-sizeof(struct udphdr),
				fraglen, &fake->wcheck))
			return -EFAULT;
		return 0;
	}

	/* First fragment: payload goes behind the space left for the header. */
	if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), fake->iov, offset,
			fraglen-sizeof(struct udphdr), &fake->wcheck))
		return -EFAULT;

	/* Add the UDP header itself ... */
	fake->wcheck = csum_partial((char *)fake, sizeof(struct udphdr),
				    fake->wcheck);
	/* ... and the pseudo-header, folding to the final 16-bit value. */
	fake->uh.check = csum_tcpudp_magic(fake->saddr, fake->daddr,
					   ntohs(fake->uh.len),
					   IPPROTO_UDP, fake->wcheck);
	/* A computed checksum of 0 is replaced by -1. */
	if (fake->uh.check == 0)
		fake->uh.check = -1;

	memcpy(to, fake, sizeof(struct udphdr));
	return 0;
}
/*
 * Copy len bytes from the user iovec array, starting offset bytes into it,
 * to kdata while accumulating a checksum.
 *
 * kdata  - destination buffer
 * iov    - iovec array describing the user data
 * offset - number of bytes of the iovec data to skip first
 * len    - number of bytes to copy
 * csump  - in: starting checksum; out: updated checksum (written on success)
 *
 * Data is summed in 4-byte groups.  When an iovec entry ends in the middle
 * of a group (and more data follows), the odd bytes are copied without being
 * summed and counted in partial_cnt; the group is summed once the following
 * entry has supplied the rest of it.
 *
 * Returns 0 on success, -EFAULT when copy_from_user() fails, or the error
 * reported by csum_and_copy_from_user().
 */
int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
     int offset, unsigned int len, int *csump)
{
int csum = *csump;
int partial_cnt = 0, err = 0;

/* Skip over the finished iovecs */
while (offset >= iov->iov_len)// skip entries that lie wholly before offset
{
offset -= iov->iov_len;
iov++;
}

while (len > 0)
{
  u8 *base = iov->iov_base + offset;
  unsigned int copy = min(len, iov->iov_len - offset);// bytes taken from this entry
  offset = 0;
  /* There is a remnant from previous iov. */
  if (partial_cnt)// 1, 2 or 3 bytes copied but not yet summed
  {
   int par_len = 4 - partial_cnt;// bytes needed from this entry to complete the group
   // par_len is 3, 2 or 1
   /* iov component is too short ... */
   if (par_len > copy) {// this entry cannot complete the group
    if (copy_from_user(kdata, base, copy))
     goto out_fault;
    kdata += copy;
    base += copy;
    // partial_cnt + par_len == 4 and par_len > copy, so partial_cnt + copy < 4
    partial_cnt += copy;// bytes copied but still not summed
    len   -= copy;// these bytes are done as far as copying goes
    iov++;// move on; when len is now 0 the code below sums the tail and leaves
    if (len)// more data to copy
     continue;
    *csump = csum_partial(kdata - partial_cnt,
        partial_cnt, csum);// sum the short tail that ends the data
    goto out;
   }
   if (copy_from_user(kdata, base, par_len))// copy the bytes that complete the group
    goto out_fault;
   // sum the group made of the previous entry's tail and this entry's head
   csum = csum_partial(kdata - partial_cnt, 4, csum);
   kdata += par_len;
   base += par_len;
   copy -= par_len;
   len   -= par_len;
   partial_cnt = 0;
  }

  if (len > copy)// more data follows after this entry
  {
   partial_cnt = copy % 4;// bytes beyond the last whole 4-byte group
   if (partial_cnt)// this entry does not end on a group boundary
   {
    copy -= partial_cnt;
    if (copy_from_user(kdata + copy, base + copy,
       partial_cnt))// copy the odd tail bytes first, without summing them
     goto out_fault;
   }
  }

  if (copy) {
   csum = csum_and_copy_from_user(base, kdata, copy,
       csum, &err);// then copy and sum the part in front of the tail
   if (err)
    goto out;
  }
  len   -= copy + partial_cnt;// bytes copied in this pass
  kdata += copy + partial_cnt;// advance the destination pointer
  iov++;
}
        *csump = csum;
out:
return err;

out_fault:
err = -EFAULT;
goto out;
}

函数声明asmlinkage unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum);
定义arch/i386/lib/checksum.S

/*
2006.9.13
* INET  An implementation of the TCP/IP protocol suite for the LINUX
*  operating system. INET is implemented using the BSD Socket
*  interface as the means of communication with the user level.
*
*  IP/TCP/UDP checksumming routines
*
* Authors: Jorge Cwik, <>
*  Arnt Gulbrandsen, <>
*  Tom May, <>
*              Pentium Pro/II routines:
*              Alexander Kjeldaas <>
*              Finn Arne Gangstad <>
*  Lots of code moved from tcp.c and ip.c; see those files
*  for more names.
*
* Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
*        handling.
*  Andi Kleen, add zeroing on error
*                   converted to pure assembler
*
*  This program is free software; you can redistribute it and/or
*  modify it under the terms of the GNU General Public License
*  as published by the Free Software Foundation; either version
*  2 of the License, or (at your option) any later version.
*/

#include
#include

/*
* computes a partial checksum, e.g. for TCP/UDP fragments
计算部分检验和，例如针对TCP/UDP分片
*/

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/

.text
.align 4
.globl csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM //Pentium Pro?

   /*
    * Experiments with Ethernet and SLIP connections show that buff
    针对以太网和SLIP连接的实验显示
    * is aligned on either a 2-byte or 4-byte boundary. We get at
    buff被对齐到2字节或4字节边界
    * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
     如果是4字节对齐的话，我们在486和奔腾上至少能够获得2倍的加速
    * Fortunately, it is easy to convert 2-byte alignment to 4-byte
    幸运的是，很容易从2字节对齐转换到4字节对齐
    * alignment for the unrolled 解开, 打开loop.
    */
/*
 * csum_partial(buff, len, sum), variant built when
 * CONFIG_X86_USE_PPRO_CHECKSUM is not defined.
 * Arguments are read from the stack; the running sum is kept in %eax.
 * Order of work: optional 2-byte alignment step, 32-byte unrolled loop,
 * remaining whole dwords, then a 1-3 byte tail.
 */
csum_partial:
// no stack frame is set up; arguments are addressed off %esp
pushl %esi
pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
testl $2, %esi  # Check alignment (test ANDs without storing a result).
jz 2f   # Jump if alignment is ok (bit 1 clear: treated as 4-byte aligned).
subl $2, %ecx  # Alignment uses up two bytes (buffer was 2-byte aligned).
jae 1f   # Jump if we had at least two bytes (jae: above or equal).
addl $2, %ecx  # ecx was < 2. Deal with it: undo the subtraction, go to the tail code.
jmp 4f
1: movw (%esi), %bx # sum the two leading bytes
addl $2, %esi
addw %bx, %ax
adcl $0, %eax # fold the carry back in; per the original note it cannot carry twice
2:
movl %ecx, %edx # keep a copy of len
shrl $5, %ecx # len / 32: the loop below consumes 32 bytes per pass
jz 2f # fewer than 32 bytes
testl %esi, %esi # clears CF before the adcl chain
1: movl (%esi), %ebx
adcl %ebx, %eax # eax was initialised with sum above
movl 4(%esi), %ebx
adcl %ebx, %eax
movl 8(%esi), %ebx
adcl %ebx, %eax
movl 12(%esi), %ebx
adcl %ebx, %eax
movl 16(%esi), %ebx
adcl %ebx, %eax
movl 20(%esi), %ebx
adcl %ebx, %eax
movl 24(%esi), %ebx
adcl %ebx, %eax
movl 28(%esi), %ebx
adcl %ebx, %eax
# one 32-byte block summed; advance the pointer by 32
lea 32(%esi), %esi
dec %ecx
jne 1b
adcl $0, %eax # add the final carry
2: movl %edx, %ecx # restore len; handle what is left below 32 bytes
andl $0x1c, %edx # mask 0b11100: bytes held in whole dwords
je 4f # no whole dword left, only a 0-3 byte tail
shrl $2, %edx  # This clears CF (the two bits shifted out are zero); edx = dword count
3: adcl (%esi), %eax # one dword per pass
lea 4(%esi), %esi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx # tail length: 0, 1, 2 or 3
jz 7f # nothing left
cmpl $2, %ecx
jb 5f # exactly 1 byte left
movw (%esi),%cx # 2 or 3 bytes left: load a word
leal 2(%esi),%esi
je 6f # exactly 2 bytes
shll $16,%ecx # 3 bytes: move the word to the upper half, leaving the low 16 bits zero
5: movb (%esi),%cl # last byte into cl (in the 1-byte case the rest of ecx is already 0)
6: addl %ecx,%eax
adcl $0, %eax
7:
popl %ebx
popl %esi
ret

#else

/* Version for PentiumII/PPro */

/*
 * csum_partial(buff, len, sum), variant built when
 * CONFIG_X86_USE_PPRO_CHECKSUM is defined.
 * The whole-dword part is summed by a run of 32 adcl instructions (128
 * bytes); the first entry into that run is a computed jump so that the
 * dwords below a multiple of 128 are handled by the same code.
 */
csum_partial:
pushl %esi
pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf

testl $2, %esi # buffer on a 2-byte (not 4-byte) boundary?
jnz 30f
10: # buffer treated as 4-byte aligned from here on
# len is split three ways: ecx = count of 128-byte chunks (len >> 7),
# ebx = bytes held in whole dwords below 128 (len & 0x7c), and a 0-3 byte tail
movl %ecx, %edx # keep a copy of len
movl %ecx, %ebx
andl $0x7c, %ebx # mask 0b1111100: bytes in whole dwords below 128
shrl $7, %ecx # ecx = number of whole 128-byte chunks
addl %ebx,%esi
shrl $2, %ebx # number of dwords
negl %ebx
lea 45f(%ebx,%ebx,2), %ebx # 45f - 3*dwords: entry point into the adcl run below
testl %esi, %esi # clear CF
jmp *%ebx # enter the run; this first pass covers the dwords below 128 bytes

# Handle 2-byte-aligned regions
20: addw (%esi), %ax
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b

30: subl $2, %ecx
ja 20b # len > 2
je 32f # len == 2
movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
addl %ebx, %eax
adcl $0, %eax
jmp 80f
32:
addw (%esi), %ax # csumming 2 bytes, 2-aligned; the carry is added by the next adcl
adcl $0, %eax
jmp 80f

40:
addl -128(%esi), %eax
adcl -124(%esi), %eax # each instruction of this run encodes in 3 bytes, matching the x3 scaling above
adcl -120(%esi), %eax
adcl -116(%esi), %eax
adcl -112(%esi), %eax
adcl -108(%esi), %eax
adcl -104(%esi), %eax
adcl -100(%esi), %eax
adcl -96(%esi), %eax
adcl -92(%esi), %eax
adcl -88(%esi), %eax
adcl -84(%esi), %eax
adcl -80(%esi), %eax
adcl -76(%esi), %eax
adcl -72(%esi), %eax
adcl -68(%esi), %eax
adcl -64(%esi), %eax
adcl -60(%esi), %eax
adcl -56(%esi), %eax
adcl -52(%esi), %eax
adcl -48(%esi), %eax
adcl -44(%esi), %eax
adcl -40(%esi), %eax
adcl -36(%esi), %eax
adcl -32(%esi), %eax
adcl -28(%esi), %eax
adcl -24(%esi), %eax
adcl -20(%esi), %eax
adcl -16(%esi), %eax
adcl -12(%esi), %eax
adcl -8(%esi), %eax
adcl -4(%esi), %eax
45:
lea 128(%esi), %esi # step to the next 128-byte chunk
adcl $0, %eax
dec %ecx
jge 40b
# all whole dwords are summed
movl %edx, %ecx # restore len
50: andl $3, %ecx # is there a tail of fewer than 4 bytes?
jz 80f

# Handle the last 1-3 bytes without jumping
notl %ecx  # 1->2, 2->1, 3->0, higher bits are masked
# 11111111 11111111 11111111 11111110    1
# 11111111 11111111 11111111 11111101    2
# 11111111 11111111 11111111 11111100    3
movl $0xffffff,%ebx # by the shll and shrl instructions (tail 1: shift right 16, tail 2: shift right 8, tail 3: no shift)
# 00000000 11111111 11111111 11111111    ebx
shll $3,%ecx
# 11111111 11111111 11111111 11110000    1
# 11111111 11111111 11111111 11101000    2
# 11111111 11111111 11111111 11100000    3
shrl %cl,%ebx # the shift count is taken mod 32
# 00000000 00000000 00000000 11111111    1
# 00000000 00000000 11111111 11111111    2
# 00000000 11111111 11111111 11111111    3
andl -128(%esi),%ebx # esi is 4-aligned so should be ok
addl %ebx,%eax
adcl $0,%eax
80:
popl %ebx
popl %esi
ret

#endif

/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
int len, int sum, int *src_err_ptr, int *dst_err_ptr)
*/

/*
* Copy from ds while checksumming, otherwise like csum_partial
*
* The macros SRC and DST specify the type of access for the instruction.
* thus we can call a custom exception handler for all access types.
*
* FIXME: could someone double-check whether I haven't mixed up some SRC and
* DST definitions? It's damn hard to trigger all cases. I hope I got
* them all but there's no guarantee.
*/

/* Wrap a source-side access y: label it 9999 and add an __ex_table entry
 * that pairs that address with the fixup label 6001. */
#define SRC(y...) \
9999: y; \
.section __ex_table, "a"; \
.long 9999b, 6001f ; \
.previous

/* Same for a destination-side access; the paired fixup label is 6002. */
#define DST(y...) \
9999: y; \
.section __ex_table, "a"; \
.long 9999b, 6002f ; \
.previous

.align 4
.globl csum_partial_copy_generic

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

#define ARGBASE 16
#define FP  12

/*
 * csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, dst_err_ptr),
 * variant built when CONFIG_X86_USE_PPRO_CHECKSUM is not defined.
 * Copies len bytes from src to dst while adding them into the sum kept in
 * %eax.  Every load and store is wrapped so that a fault is routed to the
 * fixup code at 6001 (source side) or 6002 (destination side), which stores
 * -EFAULT through the matching error pointer.
 */
csum_partial_copy_generic:
subl $4,%esp
pushl %edi
pushl %esi
pushl %ebx
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
movl ARGBASE+8(%esp),%edi # dst

testl $2, %edi   # Check alignment.
jz 2f    # Jump if alignment is ok (treated as 4-byte aligned).
subl $2, %ecx   # Alignment uses up two bytes.
jae 1f    # Jump if we had at least two bytes (len >= 2).
addl $2, %ecx   # ecx was < 2. Deal with it.
jmp 4f
SRC(1: movw (%esi), %bx )
addl $2, %esi
DST( movw %bx, (%edi) )
addl $2, %edi
addw %bx, %ax
adcl $0, %eax
2:
movl %ecx, FP(%esp) # save the remaining length in the slot reserved by subl $4,%esp above
shrl $5, %ecx # len / 32: the loop below handles 32 bytes per pass
jz 2f # fewer than 32 bytes: skip the unrolled loop
testl %esi, %esi # clear CF

SRC(1: movl (%esi), %ebx )
SRC( movl 4(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
adcl %edx, %eax
DST( movl %edx, 4(%edi) )

SRC( movl 8(%esi), %ebx )
SRC( movl 12(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 8(%edi) )
adcl %edx, %eax
DST( movl %edx, 12(%edi) )

SRC( movl 16(%esi), %ebx )
SRC( movl 20(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 16(%edi) )
adcl %edx, %eax
DST( movl %edx, 20(%edi) )

SRC( movl 24(%esi), %ebx )
SRC( movl 28(%esi), %edx )
adcl %ebx, %eax
DST( movl %ebx, 24(%edi) )
adcl %edx, %eax
DST( movl %edx, 28(%edi) )

lea 32(%esi), %esi
lea 32(%edi), %edi
dec %ecx
jne 1b # more 32-byte blocks to go
adcl $0, %eax
2: movl FP(%esp), %edx # reload the saved length
movl %edx, %ecx
andl $0x1c, %edx # mask 0b11100: bytes held in whole dwords
je 4f # no whole dword left in the part below 32 bytes
shrl $2, %edx # This clears CF; edx = dword count
SRC(3: movl (%esi), %ebx )
adcl %ebx, %eax
DST( movl %ebx, (%edi) )
lea 4(%esi), %esi
lea 4(%edi), %edi
dec %edx
jne 3b
adcl $0, %eax
4: andl $3, %ecx
jz 7f # no 1-3 byte tail
cmpl $2, %ecx
jb 5f # exactly one byte left
SRC( movw (%esi), %cx )
leal 2(%esi), %esi
DST( movw %cx, (%edi) )
leal 2(%edi), %edi
je 6f
shll $16,%ecx # three bytes: move the word into the upper half
SRC(5: movb (%esi), %cl )
DST( movb %cl, (%edi) )
6: addl %ecx, %eax # add the assembled tail
adcl $0, %eax
7:
5000:

# Exception handler:
.section .fixup, "ax"

6001:
movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx) # report a fault on the source side

# zero the complete destination - computing the rest
# is too much work (the whole destination is cleared)
movl ARGBASE+8(%esp), %edi # dst
movl ARGBASE+12(%esp), %ecx # len
xorl %eax,%eax

// user -> kernel copy: the kernel-side buffer is assumed to exist (annotator's note)
rep ; stosb # open question from the annotator: can this store fault? (destination kernel, source user space)

jmp 5000b

6002:
movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT,(%ebx) # report a fault on the destination side

// kernel -> user copy: the kernel-side buffer is assumed to exist (annotator's note)
jmp 5000b

.previous

popl %ebx
popl %esi
popl %edi
popl %ecx # equivalent to addl $4,%esp
ret

#else

/* Version for PentiumII/PPro */

#define ROUND1(x) \
SRC(movl x(%esi), %ebx ) ; \
addl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;

#define ROUND(x) \
SRC(movl x(%esi), %ebx ) ; \
adcl %ebx, %eax ; \
DST(movl %ebx, x(%edi) ) ;

#define ARGBASE 12

/*
 * csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, dst_err_ptr),
 * variant built when CONFIG_X86_USE_PPRO_CHECKSUM is defined.
 * Whole dwords are copied and summed by a run of 16 ROUND blocks (64
 * bytes); the first entry into the run is a computed jump so that the
 * dwords below a multiple of 64 are handled by the same code.  A 1-3 byte
 * tail is handled after the run.
 */
csum_partial_copy_generic:
pushl %ebx
pushl %edi
pushl %esi
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
movl ARGBASE+16(%esp),%eax #sum
# movl %ecx, %edx
movl %ecx, %ebx # len
movl %esi, %edx # src
shrl $6, %ecx   # ecx = number of whole 64-byte blocks
andl $0x3c, %ebx # bytes held in whole dwords below 64
negl %ebx # negated byte count (not shifted down to a dword count)
subl %ebx, %esi
subl %ebx, %edi
lea -1(%esi),%edx # esi - 1
andl $-32,%edx # mask 0xFFFFFFE0: round down to a 32-byte boundary
lea 3f(%ebx,%ebx), %ebx # 3f - 2*bytes: entry point into the run (8 code bytes per dword, see the listing below)
testl %esi, %esi #clear CF
jmp *%ebx
1: addl $64,%esi
addl $64,%edi
SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) # touch two source addresses 32 bytes apart, presumably to pull them into the cache
// movb -32(%edx),%bl  touches the earlier 32-byte line
// movb (%edx),%bl     touches the later 32-byte line
/*
8b 5e c0                mov    0xffffffc0(%esi),%ebx
01 d8                   add    %ebx,%eax
89 5f c0                mov    %ebx,0xffffffc0(%edi)

8b 5e c4                mov    0xffffffc4(%esi),%ebx
11 d8                   adc    %ebx,%eax
89 5f c4                mov    %ebx,0xffffffc4(%edi)
*/
ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
3: adcl $0,%eax
addl $64, %edx
dec %ecx
jge 1b
4: movl ARGBASE+12(%esp),%edx # len
andl $3, %edx
jz 7f # no tail bytes
cmpl $2, %edx
jb 5f # exactly one tail byte
SRC( movw (%esi), %dx         )
leal 2(%esi), %esi
DST( movw %dx, (%edi)         )
leal 2(%edi), %edi
je 6f
shll $16,%edx
5:
SRC( movb (%esi), %dl         )
DST( movb %dl, (%edi)         )
6: addl %edx, %eax
adcl $0, %eax
7:
.section .fixup, "ax"
6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
movl $-EFAULT, (%ebx)
# zero the complete destination (computing the rest is too much work)
movl ARGBASE+8(%esp),%edi # dst
movl ARGBASE+12(%esp),%ecx # len
xorl %eax,%eax
rep; stosb
jmp 7b
6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
movl $-EFAULT, (%ebx)
jmp 7b
.previous

popl %esi
popl %edi
popl %ebx
ret

#undef ROUND
#undef ROUND1

#endif

//计算伪首部检验和

/*
 * Add the pseudo-header fields to sum and fold the result down to the
 * final 16-bit checksum value.
 */
static inline unsigned short int csum_tcpudp_magic(unsigned long saddr,
         unsigned long daddr,
         unsigned short len,
         unsigned short proto,
         unsigned int sum)
{
	/* 32-bit sum including the pseudo-header, not yet folded. */
	unsigned long unfolded = csum_tcpudp_nofold(saddr, daddr, len, proto, sum);

	return csum_fold(unfolded);
}

/*
 * Add the pseudo-header fields (both addresses plus one word built from the
 * length and the protocol number) to the running 32-bit sum, without folding
 * it to 16 bits.  Returns the updated sum.
 */
static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,
         unsigned long daddr,
         unsigned short len,
         unsigned short proto,
         unsigned int sum)
{
    /* sum += daddr; sum += saddr + CF; sum += length/protocol word + CF;
     * sum += CF */
    __asm__("
addl %1, %0
adcl %2, %0
adcl %3, %0
adcl $0, %0
"
: "=r" (sum)
: "g" (daddr), "g"(saddr), "g"((ntohs(len)<<16)+proto*256), "0"(sum));
    return sum;
}

/*
 * csum_fold - fold a 32-bit running sum into the final 16-bit checksum.
 * @sum: 32-bit one's-complement accumulator
 *
 * Adds the low half of sum to its high half with end-around carry and
 * returns the complement of that 16-bit result, in the low 16 bits.
 *
 * The operand annotations are ordinary C comments: a '#' comment is only
 * valid inside the assembler text, not in the C operand list.  The template
 * is written as adjacent string literals because a string literal may not
 * span source lines.
 */
static inline unsigned int csum_fold(unsigned int sum)
{
	__asm__(
		/* Add the low half (sitting in the high 16 bits of %1) to the
		 * high half; the add may set CF. */
		"addl %1, %0\n\t"
		/* The low 16 bits of %0 are zero here.  With CF set this adds
		 * 0x10000, i.e. +1 in the high half; without it the low half
		 * becomes 0xffff and the high half is left unchanged. */
		"adcl $0xffff, %0"
		: "=r" (sum)
		: "r" (sum << 16),		/* low half moved up to bits 16-31 */
		  "0" (sum & 0xffff0000)	/* high half kept, low 16 bits cleared */
	);
	/* Complement, then bring the high half down to the low 16 bits. */
	return (~sum) >> 16;
}

2.2 udp接收检验和计算

/*
 * Excerpt of udp_rcv(): only the checksum initialisation and the delivery
 * to a matching socket are quoted; local declarations, the csum_error label
 * and the rest of the body are not shown.
 */
int udp_rcv(struct sk_buff *skb, unsigned short len)
{

if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)// set up checksum state first
goto csum_error;

sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);// look up the receiving socket

if (sk != NULL) {
  udp_queue_rcv_skb(sk, skb);
  sock_put(sk);
  return 0;
}
}

/*
 * Excerpt of udp_queue_rcv_skb(): only the CONFIG_FILTER checksum step is
 * quoted; the queueing code that follows is not shown.
 */
static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
{
/*
* Charge it to the socket, dropping if the queue is full.
*/

#if defined(CONFIG_FILTER)
if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {// a filter is attached and the checksum is still pending
  if (__udp_checksum_complete(skb)) {// finish the checksum now; non-zero means it is bad
   UDP_INC_STATS_BH(UdpInErrors);
   IP_INC_STATS_BH(IpInDiscards);
   ip_statistics[smp_processor_id()*2].IpInDelivers--;
   kfree_skb(skb);
   return -1;
  }
  skb->ip_summed = CHECKSUM_UNNECESSARY;
}
#endif
}

/*
 * Prepare the checksum state of a received UDP packet.
 *
 * Returns -1 when a hardware-summed packet fails udp_check(), otherwise 0.
 * In the software case only the pseudo-header sum is stored in skb->csum
 * here; the remainder is summed later.
 */
static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
        unsigned short ulen, u32 saddr, u32 daddr)
{
	/* A zero checksum field means the sender did not compute one. */
	if (uh->check == 0) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	/* The sum in skb->csum came from hardware: check it together with
	 * the addresses and length (udp_check presumably adds the
	 * pseudo-header). */
	if (skb->ip_summed == CHECKSUM_HW) {
		if (udp_check(uh, ulen, saddr, daddr, skb->csum))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	/* Still to be verified in software: start with the pseudo-header. */
	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
		skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);

	/* Probably, we should checksum udp header (it should be in cache
	 * in any case) and data in tiny packets (< rx copybreak).
	 */
	return 0;
}

/*
 * Finish the UDP checksum: sum skb->len bytes starting at skb->h.raw on top
 * of the value already held in skb->csum, then fold.  Callers treat a
 * non-zero result as a bad checksum.
 */
static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
{
	unsigned int total = csum_partial(skb->h.raw, skb->len, skb->csum);

	return (unsigned short)csum_fold(total);
}

3 tcp的检验和计算

3.1 tcp发送检验和计算

/*
 * Excerpt of tcp_transmit_skb(): only the checksum call is quoted; the
 * declarations of tp and th and the rest of the body are not shown.
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
// compute the checksum through the address-family hook
tp->af_specific->send_check(sk, th, skb->len, skb);// tcp_v4_send_check for IPv4, per the original note
}

/*
 * Fill in th->check for an outgoing segment: the TCP header is summed on top
 * of the value already held in skb->csum, then tcp_v4_check() adds the
 * pseudo-header and folds.
 */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb)
{
	/* th->doff << 2 is the length passed for the header sum. */
	unsigned int hdr_sum = csum_partial((char *)th, th->doff<<2, skb->csum);

	th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, hdr_sum);
}

/*
 * Combine a partial sum (base) with the TCP pseudo-header built from the
 * addresses and len, and return the folded 16-bit result.  th itself is not
 * read here.
 */
static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
unsigned long saddr, unsigned long daddr,
unsigned long base)
{
	u16 folded = csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);

	return folded;
}

3.2 tcp接收检验和计算

/*
 * Excerpt of tcp_v4_rcv(): header/checksum validation, socket lookup and the
 * hand-off to the receive path are quoted; local declarations, the
 * bad_packet label and the rest of the body are not shown.
 */
int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
{
/* Reject a data offset smaller than the basic header, or a packet whose
 * checksum set-up fails. */
if (th->doff < sizeof(struct tcphdr)/4 ||
     (skb->ip_summed != CHECKSUM_UNNECESSARY &&
      tcp_v4_checksum_init(skb) < 0))
  goto bad_packet;

sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));

if (!sk->lock.users) {
  if (!tcp_prequeue(sk, skb))// prequeue: a technique used to speed up data transfer (original note)
  // per the original note, tcp_rcvmsg and tcp_rcv_established cooperate to speed up transfer
   ret = tcp_v4_do_rcv(sk, skb);
} else
  sk_add_backlog(sk, skb);// put on the backlog; per the original note it is processed at release_sock time

}

/*
 * Excerpt of tcp_v4_do_rcv(): only the established-state fast path is
 * quoted; the reset label and the handling of other states are not shown.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
if (sk->state == TCP_ESTABLISHED) { /* Fast path */
  TCP_CHECK_TIMER(sk);
  if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
   goto reset;
  TCP_CHECK_TIMER(sk);
  return 0;
}
}

/*
 * Excerpt of tcp_rcv_established(): only the deferred checksum verification
 * is quoted; the csum_error label and the rest of the body are not shown.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
   struct tcphdr *th, unsigned len)
{
    /* A non-zero result means the checksum did not verify. */
    if (tcp_checksum_complete_user(sk, skb))
     goto csum_error;
}

/*
 * Prepare the checksum state of a received TCP segment.
 *
 * Hardware-summed packets and packets of at most 76 bytes are verified
 * immediately.  For larger packets only the complemented pseudo-header sum
 * is stored in skb->csum and the rest is summed later.
 * Returns -1 when an immediate verification fails, otherwise 0.
 */
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr, skb->csum) != 0) {
			NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
			return -1;
		}
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		return 0;
	}

	if (skb->len > 76) {
		/* Pseudo-header only for now; the remainder is summed later. */
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
		return 0;
	}

	/* Small packet: sum all of it right away. */
	if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
			 skb->nh.iph->daddr,
			 csum_partial((char *)skb->h.th, skb->len, 0)) != 0)
		return -1;

	skb->ip_summed = CHECKSUM_UNNECESSARY;
	return 0;
}

/*
 * Verify the part of the checksum that is still pending, if any.
 * Returns 1 when the checksum had to be completed and came out non-zero,
 * otherwise 0.
 */
static __inline__ int
tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
{
	/* Nothing left to verify. */
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return 0;

	return __tcp_checksum_complete_user(sk, skb) != 0;
}

/*
 * Complete the checksum of skb.  When sk->lock.users is set, bottom halves
 * are enabled for the duration of the computation and disabled again
 * afterwards.  Returns the value of __tcp_checksum_complete().
 */
static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
{
	int bad;

	if (!sk->lock.users)
		return __tcp_checksum_complete(skb);

	local_bh_enable();
	bad = __tcp_checksum_complete(skb);
	local_bh_disable();

	return bad;
}

/*
 * Finish the TCP checksum: sum skb->len bytes starting at skb->h.raw on top
 * of the value already held in skb->csum, then fold.  Callers treat a
 * non-zero result as a bad checksum.
 */
static __inline__ int __tcp_checksum_complete(struct sk_buff *skb)
{
	unsigned int total = csum_partial(skb->h.raw, skb->len, skb->csum);

	return (unsigned short)csum_fold(total);
}

阅读(2151) | 评论(1) | 转发(0) |

上一篇：网卡驱动加载脚本的分析

下一篇：内核对浮点处理器的使用机制

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6