Chinaunix首页 | 论坛 | 博客
  • 博客访问: 69382
  • 博文数量: 33
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 11
  • 用 户 组: 普通用户
  • 注册时间: 2014-06-10 16:37
文章分类
文章存档

2016年(6)

2015年(23)

2014年(4)

我的朋友

分类: LINUX

2014-08-29 10:56:03

原文地址:linux网络模块的发展历程 作者:zoleo

2010/6/18:还在分析linux/net代码中,觉得在2.6的内核用来分析代码流程的确很麻烦。原理的东西一般都很好理解,有的技术PPT可能几页就能讲的很清楚了。比如GRE,理论上就是圆环套圆环(记得好像是那部电影??),可是分析2.6的代码就麻烦的很。还要理解tunnel。
所以突然想到,跟踪linux/net功能的发展历程来分析,应该还不错。
 
于是,先下载了0.96c的代码,发现这个版本的代码还只支持本地socket(AF_UNIX),还不算真正意义上的net。
找来找去,搞了一个0.99.15。还算完善些。
这个版本支持INET,也就是IPV4。所以有值得分析的部分。
先看看吧,有了心得再写下来。希望借这个版本能够把路由策略(route.c)和TCP协议(tcp.c)很好地掌握一下。
 
1. 路由策略
在ip报文的发送和转发过程中,都涉及到路由的问题。在ip_build_header和ip_forward中,都调用了rt_route()。rt_route()这个函数根据传入的目的ip,返回一个rtable结构。这个结构就是我们经常看到的路由表,route这个命令能够查看到。

/*
 * One routing table entry.
 *
 * Entries are chained through rt_next into a plain singly linked list;
 * there is no hashing, lookup is a linear walk.
 */
struct rtable {
  struct rtable *rt_next;       /* next entry in the list, NULL at the end */
  unsigned long rt_dst;         /* destination address */
  unsigned long rt_mask;        /* netmask applied when matching rt_dst */
  unsigned long rt_gateway;     /* gateway address for this route */
  unsigned char rt_flags;       /* route flags (presumably describe the next hop - confirm) */
  unsigned char rt_metric;      /* route metric, i.e. a cost */
  short rt_refcnt;              /* reference count */
  unsigned long rt_use;         /* bumped by rt_route() each time this entry is returned */
  unsigned short rt_mss;        /* presumably a per-route MSS - confirm */
  unsigned short rt_mtu;        /* presumably a per-route MTU - confirm */
  struct device *rt_dev;        /* device (interface) packets on this route leave through */
};

再看一下rt_route的实现,简直是爽死了。就这么几行就搞定了2.6内核那一坨东西,不过功能肯定是不及:)
 

struct rtable * rt_route(unsigned long daddr, struct options *opt)
{
    struct rtable *rt;
    //遍历路由表
    for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next) {

        //如果完全匹配或同一个子网,则选择这条路由。原来子网的用途在这里
        if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
            break;
        /* broadcast addresses can be special cases.. */

        //如果你是一个广播地址,并且该路由表项允许广播,也就是这个路由表项制定的物理接口支持广播,表示命中这个表项。但有一个问题就是如果有多个接口,只能从第一个表项指定的接口进行广播了,因为break了。
        if ((rt->rt_dev->flags & IFF_BROADCAST) &&
         rt->rt_dev->pa_brdaddr == daddr)
            break;
    }

    //如果是loopback,但没有loopback路由?奇怪,可能是不支持loopback接口吧
    if (daddr == rt->rt_dev->pa_addr) {
        if ((rt = rt_loopback) == NULL)
            goto no_route;
    }
    rt->rt_use++;//命中次数
    return rt;
no_route:
    return NULL;
}

由此看来,早期内核对网络的支持非常简单,用来分析网络的实现也很容易。再看看路由表项是怎么添加进去的。rt_add()负责向路由表中添加路由表项,而调用rt_add的地方有两处:rt_ioctl,以及icmp.c中对icmp路由重定向的支持。

也就是说,添加路由表项有两种方式:

a) inet_ioctl/rt_ioctl用来添加删除。

b) icmp协议的重定向报文。

在rt_add中,指定类型(flags),目的地址,掩码,目的网关地址,对应的物理接口等参数。
如此,只要花费半小时,就分析完了route.c,效率高啊:)

 
2. TCP协议
linux内核再简单,在TCP协议的实现上也简单不了,所以看一下tcp.c。linux的tcp实现参考了BSD Socket,这在文件头中有说明。
先看一下tcp_prot结构
 

struct proto tcp_prot = {
  sock_wmalloc
  sock_rmalloc
  sock_wfree,
  sock_rfree,
  sock_rspace,
  sock_wspace,
  tcp_close,
  tcp_read,
  tcp_write,
  tcp_sendto,
  tcp_recvfrom,
  ip_build_header,
  tcp_connect,
  tcp_accept,
  ip_queue_xmit,
  tcp_retransmit,
  tcp_write_wakeup,
  tcp_read_wakeup,
  tcp_rcv,
  tcp_select,
  tcp_ioctl,
  NULL,
  tcp_shutdown,
  tcp_setsockopt,
  tcp_getsockopt,
  128,
  0,
  {NULL,},
  "TCP"
};

2010/6/21:稍微考虑了一下,一个完整的C/S通信过程,最好先看Server端的处理流程,再分析Client端的处理。

Server端流程一般都是这样的情况

a)socket

b)bind

c)listen

d)accept //这个过程中一般会fork一个进程处理新建立的链接。

e)send/recv

f)close

好,先看一下socket()都做了哪些事情,sock_register函数注册了inet相关操作,对应函数inet_create()。代码就不贴了,这个函数实现比较简单,就是为sock结构分配内存并初始化。
大概过程是socket(user space)->sys_socketcall(kernel space)->sock_socket()->inet_create(前提是指定inet通信)。
这里在sock_socket中为socket结构分配内存和属于进程的fd,在inet_create中为sock分配内存。
 
接下来看一下inet_bind
 

/*
 * Bind an INET socket to the local address/port supplied by the caller.
 *
 * sock     - socket whose ->data holds the struct sock to bind
 * uaddr    - caller-supplied address, copied in with memcpy_fromfs()
 * addr_len - length of the buffer at uaddr
 *
 * Returns 0 on success (and also, after printing a warning, when
 * sock->data is NULL), -EIO if the sock is not in TCP_CLOSE, -EINVAL if
 * it already has a port number, the verify_area() error if uaddr fails
 * that check, -EACCES for a port below PROT_SOCK when suser() is false,
 * -EADDRNOTAVAIL if a non-zero address is not IS_MYADDR, and -EADDRINUSE
 * on a port conflict.
 */
static int
inet_bind(struct socket *sock, struct sockaddr *uaddr,
     int addr_len)
{
  struct sockaddr_in addr;
  struct sock *sk, *sk2;
  unsigned short snum;
  int err;

  sk = (struct sock *) sock->data;
  if (sk == NULL) {
    printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
    return(0);
  }

  /* check this error. */
  if (sk->state != TCP_CLOSE) return(-EIO);
  if (sk->num != 0) return(-EINVAL);

  err=verify_area(VERIFY_READ, uaddr, addr_len);
  if(err)
      return err;
  /* NOTE(review): when addr_len < sizeof(addr) only part of addr is filled in */
  memcpy_fromfs(&addr, uaddr, min(sizeof(addr), addr_len));

  snum = ntohs(addr.sin_port);
  DPRINTF((DBG_INET, "bind sk =%X to port = %d\n", sk, snum));
  /* sk was already loaded from sock->data above; this reload is redundant */
  sk = (struct sock *) sock->data;

  /*
   * We can't just leave the socket bound wherever it is, it might
   * be bound to a privileged port. However, since there seems to
   * be a bug here, we will leave it if the port is not privileged.
   */

  if (snum == 0) { /* no port requested: let the system pick one */
    /* NOTE(review): inet_listen() treats a 0 result from get_new_socknum()
     * as failure (-EAGAIN); no such check is made here */
    snum = get_new_socknum(sk->prot, 0);
  }
  if (snum < PROT_SOCK && !suser()) return(-EACCES);

  if (addr.sin_addr.s_addr!=0 && chk_addr(addr.sin_addr.s_addr)!=IS_MYADDR)
      return(-EADDRNOTAVAIL);    /* Source address MUST be ours! */
      
  if (chk_addr(addr.sin_addr.s_addr) || addr.sin_addr.s_addr == 0)
                    sk->saddr = addr.sin_addr.s_addr;

  DPRINTF((DBG_INET, "sock_array[%d] = %X:\n", snum &(SOCK_ARRAY_SIZE -1),
             sk->prot->sock_array[snum &(SOCK_ARRAY_SIZE -1)]));

  /* Make sure we are allowed to bind here. */
  cli();
outside_loop:

  /*
   * Socks are hashed into sock_array by local port number; walk the chain
   * for this port looking for conflicts.  The walk restarts from the head
   * (goto outside_loop) whenever a dead sock has been destroyed.
   */
  for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
                    sk2 != NULL; sk2 = sk2->next) {
#if     1    /* should be below! */
    if (sk2->num != snum) continue;
/*    if (sk2->saddr != sk->saddr) continue; */
#endif
    if (sk2->dead) {
        destroy_sock(sk2);
        goto outside_loop;
    }
    /* another live sock holds this port: both sides must allow reuse */
    if (!sk->reuse) {
        sti();
        return(-EADDRINUSE);
    }
    /* with the "#if 1" test above enabled, this port check repeats it */
    if (sk2->num != snum) continue;        /* more than one */
    if (sk2->saddr != sk->saddr) continue;    /* socket per slot ! -FB */
    if (!sk2->reuse) {
        sti();
        return(-EADDRINUSE);
    }
  }
  sti();

  /* re-insert the sock under its new port number */
  remove_sock(sk);
  put_sock(snum, sk);
  sk->dummy_th.source = ntohs(sk->num);
  sk->daddr = 0;
  sk->dummy_th.dest = 0;
  return(0);
}


再看listen

/*
 * Put an INET socket into the listening state.
 *
 * sock    - socket whose ->data holds the struct sock
 * backlog - stored as max_ack_backlog; tcp_conn_request() drops an
 *           incoming SYN once ack_backlog has reached this limit
 *
 * Returns 0 on success (and also, after printing a warning, when
 * sock->data is NULL), or -EAGAIN when the sock has no port yet and
 * get_new_socknum() cannot supply one.
 */
static int
inet_listen(struct socket *sock, int backlog)
{
  struct sock *lsk = (struct sock *) sock->data;

  if (lsk == NULL) {
    printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
    return(0);
  }

  /* An unbound sock gets a port picked for it before it can listen. */
  if (lsk->num == 0) {
    lsk->num = get_new_socknum(lsk->prot, 0);
    if (lsk->num == 0)
      return(-EAGAIN);
    put_sock(lsk->num, lsk);
    lsk->dummy_th.source = ntohs(lsk->num);
  }

  /* The limit is refreshed on every call, even when already listening. */
  lsk->max_ack_backlog = backlog;

  if (lsk->state == TCP_LISTEN)
    return(0);

  /* First transition into LISTEN: start with an empty pending count. */
  lsk->ack_backlog = 0;
  lsk->state = TCP_LISTEN;
  return(0);
}


现在可以看一下tcp_rcv(即tcp_prot中注册的接收函数)中对LISTEN状态的处理
 

    case TCP_LISTEN: //如果是listen状态才处理sync报文
        if (th->rst) {//这时收到rst报文,直接丢弃,不处理
            kfree_skb(skb, FREE_READ);
            release_sock(sk);
            return(0);
        }
        if (th->ack) {//如果收到ack报文,说明可能server端down过,通知对方链接已经被重置
            tcp_reset(daddr, saddr, th, sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
            kfree_skb(skb, FREE_READ);
            release_sock(sk);
            return(0);
        }

        if (th->syn) {//这个才是正确的报文
#if 0
            if (opt->security != 0 || opt->compartment != 0) {
                tcp_reset(daddr, saddr, th, prot, opt,dev);
                release_sock(sk);
                return(0);
            }
#endif

            /*
             * Now we just put the whole thing including
             * the header and saddr, and protocol pointer
             * into the buffer. We can't respond until the
             * user tells us to accept the connection.
             */

            tcp_conn_request(sk, skb, daddr, saddr, opt, dev);//这个函数处理sync报文,下面分析
            release_sock(sk);
            return(0);
        }

        kfree_skb(skb, FREE_READ);
        release_sock(sk);
        return(0);

 default://缺省的报文丢弃
  if (!tcp_sequence(sk, th, len, opt, saddr,dev)) {
   kfree_skb(skb, FREE_READ);
   release_sock(sk);
   return(0);
  }


/*
 * Handle an incoming SYN on a listening sock.
 *
 * sk    - the listening sock
 * skb   - the received SYN segment
 * daddr - destination address of the SYN (our side)
 * saddr - source address of the SYN (the peer)
 * opt   - options, passed through to tcp_reset()
 * dev   - device used for the MTU/netmask checks and for transmission
 *
 * Clones sk into a new sock (newsk) in TCP_SYN_RECV, builds and sends a
 * SYN/ACK from it, then queues skb (with skb->sk pointing at newsk) on
 * the listening sock's rqueue and bumps sk->ack_backlog.
 *
 * The SYN is freed without creating a connection when the listening sock
 * is dead (a reset is sent first), when ack_backlog has reached
 * max_ack_backlog, or when an allocation or build_header() fails.
 */
static void
tcp_conn_request(struct sock *sk, struct sk_buff *skb,
         unsigned long daddr, unsigned long saddr,
         struct options *opt, struct device *dev)
{
  struct sk_buff *buff;
  struct tcphdr *t1;
  unsigned char *ptr;
  struct sock *newsk;
  struct tcphdr *th;
  int tmp;

  DPRINTF((DBG_TCP, "tcp_conn_request(sk = %X, skb = %X, daddr = %X, sadd4= %X, \n"
     " opt = %X, dev = %X)\n",
     sk, skb, daddr, saddr, opt, dev));
  
  th = skb->h.th;

  /* If the socket is dead, don't accept the connection. */
  if (!sk->dead) {
      sk->data_ready(sk,0);
  } else {
    DPRINTF((DBG_TCP, "tcp_conn_request on dead socket\n"));
    tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
    kfree_skb(skb, FREE_READ);
    return;
  }

  /*
   * Make sure we can accept more. This will prevent a
   * flurry of syns from eating up all our memory.
   */

  if (sk->ack_backlog >= sk->max_ack_backlog) { /* backlog limit reached: drop the SYN */
    kfree_skb(skb, FREE_READ);
    return;
  }

  /*
   * We need to build a new sock struct.
   * It is sort of bad to have a socket without an inode attached
   * to it, but the wake_up's will just wake up the listening socket,
   * and if the listening socket is destroyed before this is taken
   * off of the queue, this will take care of it.
   */

  /* allocate the new sock; it starts out as a byte copy of the listener */
  newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
  if (newsk == NULL) {
    /* just ignore the syn. It will get retransmitted. */
    kfree_skb(skb, FREE_READ);
    return;
  }

  DPRINTF((DBG_TCP, "newsk = %X\n", newsk));
  memcpy((void *)newsk,(void *)sk, sizeof(*newsk));
  /* reset the per-connection state that must not be inherited from sk */
  newsk->wback = NULL;
  newsk->wfront = NULL;
  newsk->rqueue = NULL;
  newsk->send_head = NULL;
  newsk->send_tail = NULL;
  newsk->back_log = NULL;
  newsk->rtt = TCP_CONNECT_TIME << 3;
  newsk->rto = TCP_CONNECT_TIME;
  newsk->mdev = 0;
  newsk->max_window = 0;
  newsk->cong_window = 1;
  newsk->cong_count = 0;
  newsk->ssthresh = 0;
  newsk->backoff = 0;
  newsk->blog = 0;
  newsk->intr = 0;
  newsk->proc = 0;
  newsk->done = 0;
  newsk->partial = NULL;
  newsk->pair = NULL;
  newsk->wmem_alloc = 0;
  newsk->rmem_alloc = 0;

  newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

  newsk->err = 0;
  newsk->shutdown = 0;
  newsk->ack_backlog = 0;
  newsk->acked_seq = skb->h.th->seq+1;
  newsk->fin_seq = skb->h.th->seq;
  newsk->copied_seq = skb->h.th->seq;
  newsk->state = TCP_SYN_RECV; /* set on newsk only; the listening sk is not changed here */
  newsk->timeout = 0;
  newsk->send_seq = jiffies * SEQ_TICK - seq_offset; /* initial send sequence number, derived from jiffies */
  newsk->window_seq = newsk->send_seq;
  newsk->rcv_ack_seq = newsk->send_seq;
  newsk->urg =0;
  newsk->retransmits = 0;
  newsk->destroy = 0;
  newsk->timer.data = (unsigned long)newsk;
  newsk->timer.function = &net_timer; /* timer callback used for this sock */
  newsk->dummy_th.source = skb->h.th->dest;
  newsk->dummy_th.dest = skb->h.th->source;

  /* Swap these two, they are from our point of view. */
  newsk->daddr = saddr;
  newsk->saddr = daddr;

  put_sock(newsk->num,newsk);
  newsk->dummy_th.res1 = 0;
  newsk->dummy_th.doff = 6;
  newsk->dummy_th.fin = 0;
  newsk->dummy_th.syn = 0;
  newsk->dummy_th.rst = 0;
  newsk->dummy_th.psh = 0;
  newsk->dummy_th.ack = 0;
  newsk->dummy_th.urg = 0;
  newsk->dummy_th.res2 = 0;
  /* acked_seq and copied_seq were already set to these values above */
  newsk->acked_seq = skb->h.th->seq + 1;
  newsk->copied_seq = skb->h.th->seq;

  /* Grab the ttl and tos values and use them */
  newsk->ip_ttl=sk->ip_ttl;
  newsk->ip_tos=skb->ip_hdr->tos;

/* use 512 or whatever user asked for */
/* note use of sk->user_mss, since user has no direct access to newsk */
  if (sk->user_mss)
    newsk->mtu = sk->user_mss;
  else {
#ifdef SUBNETSARELOCAL
    if ((saddr ^ daddr) & default_mask(saddr))
#else
    if ((saddr ^ daddr) & dev->pa_mask)
#endif
      newsk->mtu = 576 - HEADER_SIZE;
    else
      newsk->mtu = MAX_WINDOW;
  }
/* but not bigger than device MTU */
  newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

/* this will min with what arrived in the packet */
  tcp_options(newsk,skb->h.th);
  /* allocate the buffer for the SYN/ACK reply */
  buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
  if (buff == NULL) {
    sk->err = -ENOMEM;
    newsk->dead = 1;
    release_sock(newsk);
    kfree_skb(skb, FREE_READ);
    return;
  }
  
  buff->mem_addr = buff;
  buff->mem_len = MAX_SYN_SIZE;
  buff->len = sizeof(struct tcphdr)+4; /* TCP header plus the 4-byte option written below */
  buff->sk = newsk;
  
  t1 =(struct tcphdr *) buff->data;

  /* Put in the IP header and routing stuff. */
  tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &dev,
             IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

  /* Something went wrong. */
  if (tmp < 0) {
    sk->err = tmp;
    buff->free=1;
    kfree_skb(buff,FREE_WRITE);
    newsk->dead = 1;
    release_sock(newsk);
    skb->sk = sk;
    kfree_skb(skb, FREE_READ);
    return;
  }

  /* tmp is the size of what build_header() wrote; the TCP header follows it */
  buff->len += tmp;
  t1 =(struct tcphdr *)((char *)t1 +tmp);
  
  /* start from a copy of the incoming TCP header, then overwrite fields */
  memcpy(t1, skb->h.th, sizeof(*t1));
  buff->h.seq = newsk->send_seq;

  /* Swap the send and the receive. */
  t1->dest = skb->h.th->source;
  t1->source = newsk->dummy_th.source;
  t1->seq = ntohl(newsk->send_seq++);
  t1->ack = 1; /* set the ACK flag */
  newsk->window = tcp_select_window(newsk);/*newsk->prot->rspace(newsk);*/
  t1->window = ntohs(newsk->window);
  t1->res1 = 0;
  t1->res2 = 0;
  t1->rst = 0;
  t1->urg = 0;
  t1->psh = 0;
  t1->syn = 1; /* set the SYN flag */
  t1->ack_seq = ntohl(skb->h.th->seq+1);
  t1->doff = sizeof(*t1)/4+1;

  /* 4-byte option after the header: bytes 2, 4, then newsk->mtu, high byte first */
  ptr =(unsigned char *)(t1+1);
  ptr[0] = 2;
  ptr[1] = 4;
  ptr[2] = ((newsk->mtu) >> 8) & 0xff;
  ptr[3] =(newsk->mtu) & 0xff;

  tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
  newsk->prot->queue_xmit(newsk, dev, buff, 0); /* hand the SYN/ACK to the transmit hook */

  reset_timer(newsk, TIME_WRITE /* -1 ? FIXME ??? */, TCP_CONNECT_TIME);
  skb->sk = newsk;

  /* Charge the sock_buff to newsk. */
  sk->rmem_alloc -= skb->mem_len;
  newsk->rmem_alloc += skb->mem_len;

  skb_queue_tail(&sk->rqueue,skb); /* queue the received SYN on the listening sock's rqueue */
  sk->ack_backlog++;
  release_sock(newsk);
}

这样,三次握手的前两步(收到SYN并回复SYN/ACK)就完成了,第三步的ACK由之后的tcp_rcv处理;下面开始用户调用accept了

accept调用关系sock_accept->inet_accept->tcp_accept,先看一下tcp_accept

 

/*
 * This will accept the next outstanding connection.
 *
 * sk    - listening sock; must be in TCP_LISTEN
 * flags - with O_NONBLOCK set the call returns instead of sleeping
 *
 * Returns the sock attached to the first queued skb (skb->sk), or NULL
 * with sk->err set to a positive errno: EINVAL (not listening), EAGAIN
 * (non-blocking and nothing queued) or ERESTARTSYS (signal pending after
 * sleeping).  inet_accept() negates sk->err before returning it.
 */
static struct sock *
tcp_accept(struct sock *sk, int flags)
{
  struct sock *newsk;
  struct sk_buff *skb;
  
  DPRINTF((DBG_TCP, "tcp_accept(sk=%X, flags=%X, addr=%s)\n",
                sk, flags, in_ntoa(sk->saddr)));

  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

  if (sk->state != TCP_LISTEN) {
    sk->err = EINVAL;
    return(NULL);
  }

  /* avoid the race. */
  cli();
  sk->inuse = 1;

  /*
   * Wait for a queued skb.  tcp_conn_request() queues the received SYN on
   * sk->rqueue with skb->sk pointing at the new sock (in TCP_SYN_RECV);
   * get_firstr() presumably takes the first entry off that queue - confirm.
   */
  while((skb = get_firstr(sk)) == NULL) {
    if (flags & O_NONBLOCK) {
        sti();
        release_sock(sk);
        sk->err = EAGAIN;
        return(NULL);
    }

    release_sock(sk);
    interruptible_sleep_on(sk->sleep);
    /* woken by an unblocked signal rather than by a new connection */
    if (current->signal & ~current->blocked) {
        sti();
        sk->err = ERESTARTSYS;
        return(NULL);
    }
    sk->inuse = 1;
  }
  sti();

  /* Now all we need to do is return skb->sk. */
  newsk = skb->sk;

  /* the SYN skb is no longer needed; one pending request has been consumed */
  kfree_skb(skb, FREE_READ);
  sk->ack_backlog--;
  release_sock(sk);
  return(newsk);
}


/*
 * Accept a connection on an INET socket.
 *
 * sock    - the listening socket
 * newsock - socket that receives the accepted connection
 * flags   - with O_NONBLOCK set, return without waiting for the new
 *           sock to leave TCP_SYN_RECV
 *
 * Returns 0 on success (and also, after printing a warning, when
 * sock->data is NULL), -EOPNOTSUPP if the protocol has no accept
 * operation, the negated sk1->err if the protocol accept returns NULL,
 * -ERESTARTSYS if a signal interrupts the wait, or the negated sk2->err
 * if the new sock did not reach TCP_ESTABLISHED and has an error set.
 */
static int
inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
  struct sock *sk1, *sk2;
  int err;

  sk1 = (struct sock *) sock->data;
  if (sk1 == NULL) {
    printk("Warning: sock->data = NULL: %d\n" ,__LINE__);
    return(0);
  }

  /*
   * We've been passed an extra socket.
   * We need to free it up because the tcp module creates
   * it's own when it accepts one.
   */

  if (newsock->data) kfree_s(newsock->data, sizeof(struct sock));
  newsock->data = NULL;

  if (sk1->prot->accept == NULL) return(-EOPNOTSUPP);

  /* Restore the state if we have been interrupted, and then returned. */
  if (sk1->pair != NULL ) {
    /* reuse the sock parked in sk1->pair by the interrupted call below */
    sk2 = sk1->pair;
    sk1->pair = NULL;
  } else {
    sk2 = sk1->prot->accept(sk1,flags);
    if (sk2 == NULL) {
        /* the protocol accept reports failure as a positive errno in sk1->err */
        if (sk1->err <= 0)
            printk("Warning sock.c:sk1->err <= 0. Returning non-error.\n");
        err=sk1->err;
        sk1->err=0;
        return(-err);
    }
  }
  newsock->data = (void *)sk2;
  sk2->sleep = newsock->wait;
  newsock->conn = NULL;
  /* NOTE(review): on this path newsock->state is not set to SS_CONNECTED */
  if (flags & O_NONBLOCK) return(0);

  cli(); /* avoid the race. */
  while(sk2->state == TCP_SYN_RECV) { /* sleep until the new sock leaves TCP_SYN_RECV */
    interruptible_sleep_on(sk2->sleep);
    if (current->signal & ~current->blocked) {
        /* interrupted: park sk2 in sk1->pair so a later call can pick it up */
        sti();
        sk1->pair = sk2;
        sk2->sleep = NULL;
        newsock->data = NULL;
        return(-ERESTARTSYS);
    }
  }
  sti();
  /* presumably tcp_rcv moves a TCP_SYN_RECV sock to TCP_ESTABLISHED when
   * the final ACK arrives (that code is not shown here) - confirm */
  if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) {

    err = -sk2->err;
    sk2->err=0;
    destroy_sock(sk2);
    newsock->data = NULL;
    return(err);
  }
  newsock->state = SS_CONNECTED; /* mark the new socket as connected */
  return(0);
}


阅读(504) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~