网络协议栈实现分析<3>–BSD层实现分析-hacktao-ChinaUnix博客

Chinaunix首页 | 论坛 | 博客

首页　| 　博文目录　| 　关于我

博客访问： 51932
博文数量： 13
博客积分： 1496
博客等级：上尉
技术积分： 140
用户组：普通用户
注册时间： 2009-04-10 00:11

文章分类

全部博文（13）

网络协议栈（4）
网络编程（2）
QT（3）
有感而发（2）
hacking（0）
操作系统（0）
linux 嵌入式（0）
未分配的博文（2）

文章存档

2011年（1）

2010年（5）

2009年（7）

我的朋友

T-Bagwel

最近访客

推荐博文

相关博文

网络协议栈实现分析<3>–BSD层实现分析

分类： LINUX

2010-03-18 19:24:23

        BSD层源码分析
写于2010.1.20

BSD层主要源码有:

net/protocols.c          链路层协议初始化函数及域初始化函数定义

net/socket.c             BSD socket 层实现文件

include/linux/net.h

对于BSD 层来说，socket.c 是这节实现的重点。故我们针对这个文件按照我们分析流程把对应BSD 层其他文件一起进行分析。

BSD层涉及到的结构体：

/*
 * Internal representation of a socket. not all the fields are used by
 * all configurations:
 *
 *          server                  client
 * conn     client connected to     server connected to
 * iconn    list of clients         -unused-
 *           awaiting connections
 * wait     sleep for clients,      sleep for connection,
 *          sleep for i/o           sleep for i/o
 */

include/linux/net.h

struct socket {

  short type; / SOCK_STREAM, … /

  socket_state state;

  long flags;

  struct proto_ops ops; /* protocols do most everything /

  void data; /* protocol data /

  struct socket conn; /* server socket connected to /

  struct socket iconn; /* incomplete client conn.s /

  struct socket next;

  struct wait_queue **wait; /* ptr to place to wait on /

  struct inode inode;

  struct fasync_struct  fasync_list; / Asynchronous wake up list /

};

//对socket操作的函数指针集合

/*
 * Per-address-family operations table. The BSD layer (socket.c) dispatches
 * through these pointers after doing the generic fd and user-memory checks.
 * Entries are registered with sock_register() and looked up by 'family'.
 */
struct proto_ops {
	int	family;

	int	(*create)	(struct socket *sock, int protocol);
	int	(*dup)		(struct socket *newsock, struct socket *oldsock);
	int	(*release)	(struct socket *sock, struct socket *peer);
	int	(*bind)		(struct socket *sock, struct sockaddr *umyaddr,
				 int sockaddr_len);
	int	(*connect)	(struct socket *sock, struct sockaddr *uservaddr,
				 int sockaddr_len, int flags);
	int	(*socketpair)	(struct socket *sock1, struct socket *sock2);
	int	(*accept)	(struct socket *sock, struct socket *newsock,
				 int flags);
	int	(*getname)	(struct socket *sock, struct sockaddr *uaddr,
				 int *usockaddr_len, int peer);
	int	(*read)		(struct socket *sock, char *ubuf, int size,
				 int nonblock);
	int	(*write)	(struct socket *sock, char *ubuf, int size,
				 int nonblock);
	int	(*select)	(struct socket *sock, int sel_type,
				 select_table *wait);
	int	(*ioctl)	(struct socket *sock, unsigned int cmd,
				 unsigned long arg);
	int	(*listen)	(struct socket *sock, int len);
	int	(*send)		(struct socket *sock, void *buff, int len, int nonblock,
				 unsigned flags);
	int	(*recv)		(struct socket *sock, void *buff, int len, int nonblock,
				 unsigned flags);
	int	(*sendto)	(struct socket *sock, void *buff, int len, int nonblock,
				 unsigned flags, struct sockaddr *, int addr_len);
	int	(*recvfrom)	(struct socket *sock, void *buff, int len, int nonblock,
				 unsigned flags, struct sockaddr *, int *addr_len);
	int	(*shutdown)	(struct socket *sock, int flags);
	int	(*setsockopt)	(struct socket *sock, int level, int optname,
				 char *optval, int optlen);
	int	(*getsockopt)	(struct socket *sock, int level, int optname,
				 char *optval, int *optlen);
	int	(*fcntl)	(struct socket *sock, unsigned int cmd,
				 unsigned long arg);
};

//这个是网络协议名字以及协议操作的集合。

/*
 * One entry of the protocols[] table walked by proto_init(): a protocol
 * name plus its bootstrap routine. The table ends at a NULL name.
 */
struct net_proto {
	char *name;				/* Protocol name */
	void (*init_func)(struct net_proto *);	/* Bootstrap */
};

文件名称： socket.c

系统调用通过 INT $0x80 进入内核来执行函数，该函数根据AX寄存器中的系统调用号进一步调用内核网络栈相应的实现函数。对于socket，bind等这些函数，socket.c文件只作第一层的实现函数（sock_socket,sock_bind）。

  虽然linux 中几乎所有的接口都是以文件形式来组织的，但对于网络栈在/dev（现在的linux内核已经有这样的文件了）目录下却无这样的对应的关系。不过内核还是提供了对于网络数据的普通文件操作方式，如write，read函数可直接用于读写网络数据，在socket.c文件中可以看到针对网络数据的文件操作函数集合的实现。

下面我们一段一段来分析socket.c：

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

//这里是对文件的操作的声明

/* Forward declarations of the file operations collected in socket_file_ops. */
static int sock_lseek(struct inode *inode, struct file *file, off_t offset,
		      int whence);
static int sock_read(struct inode *inode, struct file *file, char *buf,
		     int size);
static int sock_write(struct inode *inode, struct file *file, char *buf,
		      int size);
static int sock_readdir(struct inode *inode, struct file *file,
			struct dirent *dirent, int count);
static void sock_close(struct inode *inode, struct file *file);
static int sock_select(struct inode *inode, struct file *file, int which, select_table *seltable);
static int sock_ioctl(struct inode *inode, struct file *file,
		      unsigned int cmd, unsigned long arg);
static int sock_fasync(struct inode *inode, struct file *filp, int on);

//

/

* Socket files have a set of ’special’ operations as well as the generic file ones. These don’t appear

* in the operation structures but are done directly via the socketcall() multiplexor.

/

//文件file_operations 结构的初始化

static struct file_operations socket_file_ops = {

sock_lseek,

sock_read,

sock_write,

sock_readdir,

sock_select,

sock_ioctl,

NULL, / mmap /

NULL, / no special open code… /

sock_close,

NULL, / no fsync /

sock_fasync

};

以上file_operations结构定义了普通文件操作函数集。系统中每一个文件对应一个file结构，file结构中有一个file_operations变量，当使用write，read函数对某个文件描述符进行读写操作时，系统首先根据文件索引到其对应file，然后调用file_operations中对应的函数请求。

/

* The protocol list. Each protocol is registered in here.

/

/将在sock_register中初始化,对于不同操作域具有不同操作函数

  集，如对应INET域的inet_proto_ops操作函数集，对应unix的unix_proto_ops的操作

  /

static struct proto_ops pops[NPROTO];

/

* Statistics counters of the socket lists

/

/* Number of sockets currently allocated (sock_alloc ++, sock_release --). */
static int sockets_in_use  = 0;

/

* Support routines. Move socket addresses back and forth across the kernel/user

* divide and look after the messy bits.

/

/* Upper bound, in bytes, on a socket address copied across the user/kernel divide. */
#define MAX_SOCK_ADDR	128		/* 108 for Unix domain - 16 for IP, 16 for IPX, about 80 for AX.25 */

//数据移动到内核空间

/*
 * Copy a socket address from user space into a kernel buffer.
 *
 * uaddr: user-space source; ulen: its length in bytes;
 * kaddr: kernel destination (callers pass a MAX_SOCK_ADDR-byte array).
 *
 * Returns 0 on success (including ulen == 0, where nothing is copied),
 * -EINVAL if ulen is negative or exceeds MAX_SOCK_ADDR, or the error
 * reported by verify_area().
 */
static int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr)
{
	int err;

	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
		return -EINVAL;
	if (ulen == 0)
		return 0;
	if ((err = verify_area(VERIFY_READ, uaddr, ulen)) < 0)
		return err;
	memcpy_fromfs(kaddr, uaddr, ulen);
	return 0;
}

//数据移动到用户空间

/*
 * Copy a socket address from kernel space out to user space.
 *
 * kaddr/klen: kernel address and its length.
 * uaddr:      user buffer receiving the address.
 * ulen:       user-space int; on entry the size of uaddr, on exit the
 *             number of bytes actually stored.
 *
 * At most min(*ulen, klen) bytes are copied. Returns 0 on success,
 * -EINVAL for a negative or oversized length, or a verify_area() error.
 */
static int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen)
{
	int err;
	int len;

	/* The length word itself is read and then written back. */
	if ((err = verify_area(VERIFY_WRITE, ulen, sizeof(*ulen))) < 0)
		return err;
	len = get_fs_long(ulen);
	if (len > klen)
		len = klen;
	if (len < 0 || len > MAX_SOCK_ADDR)
		return -EINVAL;
	if (len)
	{
		if ((err = verify_area(VERIFY_WRITE, uaddr, len)) < 0)
			return err;
		memcpy_tofs(uaddr, kaddr, len);
	}
	put_fs_long(len, ulen);
	return 0;
}

/

* Obtains the first available file descriptor and sets it up for use.

/

//获得文件标识符，分配file数据结构

static int get_fd(struct inode inode)

{

int fd;

struct file file;

/

* Find a file descriptor suitable for return to the user.

/

    //获得一个文件描述给用户

   file = get_empty_filp();//分配一个file结构

if (!file)

return(-1);

for (fd = 0; fd < NR_OPEN; ++fd)

if (!current->files->fd[fd])

break;

if (fd == NR_OPEN)

{

file->f_count = 0;

return(-1);

}

FD_CLR(fd, ¤t->files->close_on_exec);

current->files->fd[fd] = file;

file->f_op = &socket_file_ops;

file->f_mode = 3;

file->f_flags = O_RDWR;

file->f_count = 1;

file->f_inode = inode;

if (inode)

inode->i_count++;

file->f_pos = 0;

return(fd);

}

/

* Go from an inode to its socket slot.

*

* The original socket implementation wasn’t very clever, which is

* why this exists at all..

/

inline struct socket socki_lookup(struct inode inode)

{

return &inode->u.socket_i;   //根据inode结构查找socket结构

}

/

* Go from a file number to its socket slot.

/

//从对应文件描述符得到找到对应file结构，进而得到

//inode结构，然后调用socki_lookup 返回socket

static inline struct socket sockfd_lookup(int fd, struct file **pfile)

{

struct file file;

struct inode inode;

if (fd < 0 || fd >= NR_OPEN || !(file = current->files->fd[fd]))

return NULL;

inode = file->f_inode;

if (!inode || !inode->i_sock)

return NULL;

if (pfile)

pfile = file;

return socki_lookup(inode);

}

/

* Allocate a socket.

/

//socket结构的分配，同时对结构进行初始化

struct socket sock_alloc(void)

{

struct inode * inode;

struct socket * sock;

      //获得一个空闲inode结构

inode = get_empty_inode();

if (!inode)

return NULL;

inode->i_mode = S_IFSOCK;

inode->i_sock = 1;

inode->i_uid = current->uid;

inode->i_gid = current->gid;

       //初始化socket结构

sock = &inode->u.socket_i;

sock->state = SS_UNCONNECTED;

sock->flags = 0;

sock->ops = NULL;

sock->data = NULL;

sock->conn = NULL;

sock->iconn = NULL;

sock->next = NULL;

sock->wait = &inode->i_wait;

sock->inode = inode; /* ”backlink”: we could use pointer arithmetic instead /

sock->fasync_list = NULL;

sockets_in_use++;

return sock;

}

/

* Release a socket.

/

//

static inline void sock_release_peer(struct socket peer)

{

peer->state = SS_DISCONNECTING;

wake_up_interruptible(peer->wait);

sock_wake_async(peer, 1);

}

//释放socket

void sock_release(struct socket sock)

{

int oldstate;

struct socket peersock, nextsock;

if ((oldstate = sock->state) != SS_UNCONNECTED)

sock->state = SS_DISCONNECTING;

/

* Wake up anyone waiting for connections.

/

for (peersock = sock->iconn; peersock; peersock = nextsock)

{

nextsock = peersock->next;

sock_release_peer(peersock);

}

/

* Wake up anyone we’re connected to. First, we release the

* protocol, to give it a chance to flush data, etc.

/

peersock = (oldstate == SS_CONNECTED) ? sock->conn : NULL;

if (sock->ops)

sock->ops->release(sock, peersock);

if (peersock)

sock_release_peer(peersock);

–sockets_in_use; / Bookkeeping.. /

iput(SOCK_INODE(sock));

}

/

* Sockets are not seekable.

/

//没有实现

/* Sockets are not seekable: always fails with -ESPIPE. */
static int sock_lseek(struct inode *inode, struct file *file, off_t offset, int whence)
{
	return(-ESPIPE);
}

/

* Read data from a socket. ubuf is a user mode pointer. We make sure the user

* area ubuf…ubuf+size-1 is writable before asking the protocol.

/

//从套接字中读取数据，ubuf是一个用户空间的指针；在调用协议层之前，我们要先确认用户空间地址

//ubuf到ubuf+size-1是可写的。

static int sock_read(struct inode inode, struct file file, char ubuf, int size)

{

struct socket sock;

int err;

if (!(sock = socki_lookup(inode)))

{

printk(“NET: sock_read: can’t find socket for inode!\n”);

return(-EBADF);

}

if (sock->flags & SO_ACCEPTCON)

return(-EINVAL);

if(size<0)

return -EINVAL;

if(size==0)

return 0;

if ((err=verify_area(VERIFY_WRITE,ubuf,size))<0)   ///这一步实现检查。

   return err;

return(sock->ops->read(sock, ubuf, size, (file->f_flags & O_NONBLOCK)));

}

/

* Write data to a socket. We verify that the user area ubuf..ubuf+size-1 is

* readable by the user process.

/

static int sock_write(struct inode inode, struct file file, char ubuf, int size)

{

struct socket sock;

int err;

if (!(sock = socki_lookup(inode)))

{

printk(“NET: sock_write: can’t find socket for inode!\n”);

return(-EBADF);

}

if (sock->flags & SO_ACCEPTCON)

return(-EINVAL);

if(size<0)

return -EINVAL;

if(size==0)

return 0;

if ((err=verify_area(VERIFY_READ,ubuf,size))<0)

   return err;

return(sock->ops->write(sock, ubuf, size,(file->f_flags & O_NONBLOCK)));

}

/

* You can’t read directories from a socket!

/

/* You can't read directories from a socket: always fails with -EBADF. */
static int sock_readdir(struct inode *inode, struct file *file, struct dirent *dirent,
			int count)
{
	return(-EBADF);
}

/*

* With an ioctl arg may well be a user mode pointer, but we don’t know what to do

* with it - thats up to the protocol still.

/

int sock_ioctl(struct inode inode, struct file file, unsigned int cmd,

   unsigned long arg)

{

struct socket sock;

if (!(sock = socki_lookup(inode)))

{

printk(“NET: sock_ioctl: can’t find socket for inode!\n”);

return(-EBADF);

}

   return(sock->ops->ioctl(sock, cmd, arg));

}

/*
 * select() on a socket file. Delegates to the protocol's select routine
 * when one exists; otherwise, or when no socket is found, reports 0.
 */
static int sock_select(struct inode *inode, struct file *file, int sel_type, select_table * wait)
{
	struct socket *sock;

	if (!(sock = socki_lookup(inode)))
	{
		printk("NET: sock_select: can't find socket for inode!\n");
		return(0);
	}

	/* We can't return errors to select, so it's either yes or no. */
	if (sock->ops && sock->ops->select)
		return(sock->ops->select(sock, sel_type, wait));
	return(0);
}

//socket关闭，调用顺序sock_release ->sock_release_peer

/*
 * close() on a socket file: remove this file from the socket's async
 * list, then release the socket (sock_release -> sock_release_peer).
 */
void sock_close(struct inode *inode, struct file *filp)
{
	struct socket *sock;

	/* It's possible the inode is NULL if we're closing an unfinished socket. */
	if (!inode)
		return;

	if (!(sock = socki_lookup(inode)))
	{
		printk("NET: sock_close: can't find socket for inode!\n");
		return;
	}
	sock_fasync(inode, filp, 0);
	sock_release(sock);
}

/

* Update the socket async list

/

//更新套接字异步通知列表

static int sock_fasync(struct inode inode, struct file filp, int on)

{

struct fasync_struct fa, *fna=NULL, **prev;

struct socket sock;

unsigned long flags;

if (on)  //根据on 来选择是否分配还是释放fasync_struct结构体

{

fna=(struct fasync_struct )kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);

if(fna==NULL)

return -ENOMEM;

}

sock = socki_lookup(inode);

prev=&(sock->fasync_list);

save_flags(flags);

cli();

for(fa=prev; fa!=NULL; prev=&fa->fa_next,fa=prev)

if(fa->fa_file==filp)

break;

if(on)

{

if(fa!=NULL)

{

kfree_s(fna,sizeof(struct fasync_struct));

restore_flags(flags);

return 0;

}

fna->fa_file=filp;

fna->magic=FASYNC_MAGIC;

fna->fa_next=sock->fasync_list;

sock->fasync_list=fna;

}

else

{

if(fa!=NULL)

{

prev=fa->fa_next;

kfree_s(fa,sizeof(struct fasync_struct));

}

}

restore_flags(flags);

return 0;

}

//唤醒套接字,通过kill_fasync

int sock_wake_async(struct socket sock, int how)

{

if (!sock || !sock->fasync_list)

return -1;

switch (how)

{

case 0:

kill_fasync(sock->fasync_list, SIGIO);

break;

case 1:

if (!(sock->flags & SO_WAITDATA))

kill_fasync(sock->fasync_list, SIGIO);

break;

case 2:

if (sock->flags & SO_NOSPACE)

{

kill_fasync(sock->fasync_list, SIGIO);

sock->flags &= ~SO_NOSPACE;

}

break;

}

return 0;

}

/*

* Wait for a connection.

/

int sock_awaitconn(struct socket mysock, struct socket servsock, int flags)

{

struct socket last;

/*

* We must be listening

/

//检查服务器端是否处于监听状态，既可以进行连接

if (!(servsock->flags & SO_ACCEPTCON))

{

return(-EINVAL);

}

   /

   * Put ourselves on the server’s incomplete connection queue.

   /

mysock->next = NULL;

cli();

if (!(last = servsock->iconn))

servsock->iconn = mysock;

else

{

while (last->next)

last = last->next;

last->next = mysock;

}

mysock->state = SS_CONNECTING;

mysock->conn = servsock;

sti();

/

* Wake up server, then await connection. server will set state to

* SS_CONNECTED if we’re connected.

/

wake_up_interruptible(servsock->wait);

sock_wake_async(servsock, 0);

if (mysock->state != SS_CONNECTED)

{

if (flags & O_NONBLOCK)

return -EINPROGRESS;

interruptible_sleep_on(mysock->wait);

if (mysock->state != SS_CONNECTED &&

    mysock->state != SS_DISCONNECTING)

{

/

* if we’re not connected we could have been

* 1) interrupted, so we need to remove ourselves

*    from the server list

* 2) rejected (mysock->conn == NULL), and have

*    already been removed from the list

/

if (mysock->conn == servsock)

{

cli();

if ((last = servsock->iconn) == mysock)

servsock->iconn = mysock->next;

else

{

while (last->next != mysock)

last = last->next;

last->next = mysock->next;

}

sti();

}

return(mysock->conn ? -EINTR : -EACCES);

}

}

return(0);

}

/

* Perform the socket system call. we locate the appropriate

* family, then create a fresh socket.

/

//socket系统调用在BSD层对应的实现函数

static int sock_socket(int family, int type, int protocol)

{

int i, fd;

struct socket sock;

struct proto_ops ops;

/ Locate the correct protocol family. /

for (i = 0; i < NPROTO; ++i)

{

if (pops[i] == NULL) continue;

//判断用那种类型family ，如INET_proto_ops,unix_proto_ops

if (pops[i]->family == family)

break;

}

if (i == NPROTO)

{

   return -EINVAL;

}

      //把对应的操作传给ops

ops = pops[i];

/

* Check that this is a type that we know how to manipulate and

* the protocol makes sense here. The family can still reject the

* protocol later.

/

if ((type != SOCK_STREAM && type != SOCK_DGRAM &&

type != SOCK_SEQPACKET && type != SOCK_RAW &&

type != SOCK_PACKET) || protocol < 0)

return(-EINVAL);

/

* Allocate the socket and allow the family to set things up. if

* the protocol is 0, the family is instructed to select an appropriate

* default.

/

//分配socket结构

if (!(sock = sock_alloc()))

{

printk(“NET: sock_socket: no more sockets\n”);

return(-ENOSR); / Was: EAGAIN, but we are out of

   system resources! /

}

sock->type = type;

sock->ops = ops;

if ((i = sock->ops->create(sock, protocol)) < 0)

{

sock_release(sock);

return(i);

}

   //分配fd ，file结构

if ((fd = get_fd(SOCK_INODE(sock))) < 0)

{

sock_release(sock);

return(-EINVAL);

}

return(fd);

}

/

* Create a pair of connected sockets.

/

//只用于unix域，用于2个进程间通过套接字进行联系数据传送

//这个函数用于本机内模拟网络方式进程间通信

static int sock_socketpair(int family, int type, int protocol, unsigned long usockvec[2])

{

int fd1, fd2, i;

struct socket sock1, sock2;

int er;

/

* Obtain the first socket and check if the underlying protocol

* supports the socketpair call.

/

if ((fd1 = sock_socket(family, type, protocol)) < 0)

return(fd1);

sock1 = sockfd_lookup(fd1, NULL);

if (!sock1->ops->socketpair)

{

sys_close(fd1);

return(-EINVAL);

}

/

* Now grab another socket and try to connect the two together.

/

if ((fd2 = sock_socket(family, type, protocol)) < 0)

{

sys_close(fd1);

return(-EINVAL);

}

sock2 = sockfd_lookup(fd2, NULL);

if ((i = sock1->ops->socketpair(sock1, sock2)) < 0)

{

sys_close(fd1);

sys_close(fd2);

return(i);

}

sock1->conn = sock2;

sock2->conn = sock1;

sock1->state = SS_CONNECTED;

sock2->state = SS_CONNECTED;

er=verify_area(VERIFY_WRITE, usockvec, 2  sizeof(int));

if(er)

{

sys_close(fd1);

sys_close(fd2);

return er;

}

put_fs_long(fd1, &usockvec[0]);

put_fs_long(fd2, &usockvec[1]);

return(0);

}

/*

* Bind a name to a socket. Nothing much to do here since it’s

* the protocol’s responsibility to handle the local address.

*

* We move the socket address to kernel space before we call

* the protocol layer (having also checked the address is ok).

/

static int sock_bind(int fd, struct sockaddr umyaddr, int addrlen)

{

struct socket sock;

int i;

char address[MAX_SOCK_ADDR];

int err;

if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)

return(-EBADF);

//通过fd获取对应的socket结构

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

       //将数据从用户缓冲区移到内核缓冲区

if((err=move_addr_to_kernel(umyaddr,addrlen,address))<0)

   return err;

if ((i = sock->ops->bind(sock, (struct sockaddr )address, addrlen)) < 0)

{

return(i);

}

return(0);

}

/*

* Perform a listen. Basically, we allow the protocol to do anything

* necessary for a listen, and if that works, we mark the socket as

* ready for listening.

/

static int sock_listen(int fd, int backlog)

{

struct socket sock;

if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

       //判断状态

if (sock->state != SS_UNCONNECTED)

{

return(-EINVAL);

}

if (sock->ops && sock->ops->listen)

sock->ops->listen(sock, backlog);

//标志位设为监听

sock->flags |= SO_ACCEPTCON;

return(0);

}

/*

* For accept, we attempt to create a new socket, set up the link

* with the client, wake up the client, then return the new

* connected fd. We collect the address of the connector in kernel

* space and move it to user at the very end. This is buggy because

* we open the socket then return an error.

/

static int sock_accept(int fd, struct sockaddr upeer_sockaddr, int upeer_addrlen)

{

struct file file;

struct socket sock, newsock;

int i;

char address[MAX_SOCK_ADDR];

int len;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

   if (!(sock = sockfd_lookup(fd, &file)))

return(-ENOTSOCK);

if (sock->state != SS_UNCONNECTED)

{

return(-EINVAL);

}

if (!(sock->flags & SO_ACCEPTCON))

{

return(-EINVAL);

}

if (!(newsock = sock_alloc()))

{

printk(“NET: sock_accept: no more sockets\n”);

return(-ENOSR); /* Was: EAGAIN, but we are out of system

   resources! /

}

newsock->type = sock->type;

newsock->ops = sock->ops;

//建立新的套接字，用于与监听套接字通信

//初始化信息和原监听套接字一样

if ((i = sock->ops->dup(newsock, sock)) < 0)

{

sock_release(newsock);

return(i);

}

i = newsock->ops->accept(sock, newsock, file->f_flags);

if ( i < 0)

{

sock_release(newsock);

return(i);

}

          //返回一个新的fd，便于通信

if ((fd = get_fd(SOCK_INODE(newsock))) < 0)

{

sock_release(newsock);

return(-EINVAL);

}

if (upeer_sockaddr)

{

      //从请求连接中数据包中取得远端地址

newsock->ops->getname(newsock, (struct sockaddr )address, &len, 1);

//复制到用户缓冲区

move_addr_to_user(address,len, upeer_sockaddr, upeer_addrlen);

}

return(fd);

}

/*

* Attempt to connect to a socket with the server address.  The address

* is in user space so we verify it is OK and move it to kernel space.

/

/该函数首先将要链接的的远程地址从用户缓冲区复制到内核缓冲区，之后根据套接字之前的状态采取措施，如果完成套接字调用函数，则简单返回EISCONN. 如果状态有效则调用sock->ops->connect 函数完成具体连接。

/

static int sock_connect(int fd, struct sockaddr uservaddr, int addrlen)

{

struct socket sock;

struct file file;

int i;

char address[MAX_SOCK_ADDR];

int err;

if (fd < 0 || fd >= NR_OPEN || (file=current->files->fd[fd]) == NULL)

return(-EBADF);

if (!(sock = sockfd_lookup(fd, &file)))

return(-ENOTSOCK);

if((err=move_addr_to_kernel(uservaddr,addrlen,address))<0)

   return err;

switch(sock->state)

{

case SS_UNCONNECTED:

/* This is ok… continue with connect /

break;

case SS_CONNECTED:

/ Socket is already connected /

if(sock->type == SOCK_DGRAM) / Hack for now - move this all into the protocol /

break;

return -EISCONN;

case SS_CONNECTING:

/ Not yet connected… we will check this. /

/

* FIXME:  for all protocols what happens if you start

* an async connect fork and both children connect. Clean

* this up in the protocols!

/

break;

default:

return(-EINVAL);

}

i = sock->ops->connect(sock, (struct sockaddr )address, addrlen, file->f_flags);

if (i < 0)

{

return(i);

}

return(0);

}

/*

* Get the local address (‘name’) of a socket object. Move the obtained

* name to user space.

/

//获得本地地址，并把从内核空间移动到用户空间

static int sock_getsockname(int fd, struct sockaddr usockaddr, int usockaddr_len)

{

struct socket sock;

char address[MAX_SOCK_ADDR];

int len;

int err;

if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

     //调用af_inet.c中inet_getname

err=sock->ops->getname(sock, (struct sockaddr )address, &len, 0);

if(err)

return err;

if((err=move_addr_to_user(address,len, usockaddr, usockaddr_len))<0)

   return err;

return 0;

}

/

* Get the remote address (‘name’) of a socket object. Move the obtained

* name to user space.

/

//获取远端地址（ip地址和端口号）

static int sock_getpeername(int fd, struct sockaddr usockaddr, int usockaddr_len)

{

struct socket sock;

char address[MAX_SOCK_ADDR];

int len;

int err;

if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

err=sock->ops->getname(sock, (struct sockaddr )address, &len, 1);

if(err)

   return err;

if((err=move_addr_to_user(address,len, usockaddr, usockaddr_len))<0)

   return err;

return 0;

}

/

* Send a datagram down a socket. The datagram as with write() is

* in user space. We check it can be read.

/

static int sock_send(int fd, void  buff, int len, unsigned flags)

{

struct socket sock;

struct file file;

int err;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

if(len<0)

return -EINVAL;

err=verify_area(VERIFY_READ, buff, len);

if(err)

return err;

return(sock->ops->send(sock, buff, len, (file->f_flags & O_NONBLOCK), flags));

}由于

/*

* Send a datagram to a given address. We move the address into kernel

* space and check the user space data area is readable before invoking

* the protocol.

/

static int sock_sendto(int fd, void  buff, int len, unsigned flags,

   struct sockaddr addr, int addr_len)

{

struct socket sock;

struct file file;

char address[MAX_SOCK_ADDR];

int err;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

if(len<0)

return -EINVAL;

err=verify_area(VERIFY_READ,buff,len);

if(err)

   return err;

if((err=move_addr_to_kernel(addr,addr_len,address))<0)

   return err;

return(sock->ops->sendto(sock, buff, len, (file->f_flags & O_NONBLOCK),

flags, (struct sockaddr )address, addr_len));

}

Send和Sendto区别？：

Sendto可以指定远端地址。而send不能。

对于TCP来说，发送数据之前必须先与远端地址建立连接。

而udp则不用。

·····································································································

/*

* Receive a datagram from a socket. This isn’t really right. The BSD manual

* pages explicitly state that recv is recvfrom with a NULL to argument. The

* Linux stack gets the right results for the wrong reason and this need to

* be tidied in the inet layer and removed from here.

* We check the buffer is writable and valid.

/

static int sock_recv(int fd, void  buff, int len, unsigned flags)

{

struct socket sock;

struct file file;

int err;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

if(len<0)

return -EINVAL;

if(len==0)

return 0;

err=verify_area(VERIFY_WRITE, buff, len);

if(err)

return err;

return(sock->ops->recv(sock, buff, len,(file->f_flags & O_NONBLOCK), flags));

}

/*

* Receive a frame from the socket and optionally record the address of the

* sender. We verify the buffers are writable and if needed move the

* sender address from kernel to user space.

/

static int sock_recvfrom(int fd, void  buff, int len, unsigned flags,

     struct sockaddr addr, int addr_len)

{

struct socket sock;

struct file file;

char address[MAX_SOCK_ADDR];

int err;

int alen;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

   return(-ENOTSOCK);

if(len<0)

return -EINVAL;

if(len==0)

return 0;

err=verify_area(VERIFY_WRITE,buff,len);

if(err)

   return err;

len=sock->ops->recvfrom(sock, buff, len, (file->f_flags & O_NONBLOCK),

     flags, (struct sockaddr )address, &alen);

if(len<0)

return len;

if(addr!=NULL && (err=move_addr_to_user(address,alen, addr, addr_len))<0)

   return err;

return len;

}

Sock_recv和sock_recvfrom区别：

Sock_recvfrom可以同时返回远端地址

·····································································································

/

* Set a socket option. Because we don’t know the option lengths we have

* to pass the user mode parameter for the protocols to sort out.

/

static int sock_setsockopt(int fd, int level, int optname, char optval, int optlen)

{

struct socket sock;

struct file file;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

return(sock->ops->setsockopt(sock, level, optname, optval, optlen));

}

/*

* Get a socket option. Because we don’t know the

option lengths we have

* to pass a user mode parameter for the protocols to sort out.

/

static int sock_getsockopt(int fd, int level, int optname, char optval, int optlen)

{

struct socket sock;

struct file file;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

if (!sock->ops || !sock->ops->getsockopt)

return(0);

return(sock->ops->getsockopt(sock, level, optname, optval, optlen));

}

/

* Shutdown a socket.

/

static int sock_shutdown(int fd, int how)

{

struct socket sock;

struct file file;

if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))

return(-EBADF);

if (!(sock = sockfd_lookup(fd, NULL)))

return(-ENOTSOCK);

return(sock->ops->shutdown(sock, how));

}

/

* Perform a file control on a socket file descriptor.

/

int sock_fcntl(struct file filp, unsigned int cmd, unsigned long arg)

{

struct socket sock;

sock = socki_lookup (filp->f_inode);

if (sock != NULL && sock->ops != NULL && sock->ops->fcntl != NULL)

return(sock->ops->fcntl(sock, cmd, arg));

return(-EINVAL);

}

/

* System call vectors. Since I (RIB) want to rewrite sockets as streams,

* we have this level of indirection. Not a lot of overhead, since more of

* the work is done via read/write/select directly.

*

* I’m now expanding this up to a higher level to separate the assorted

* kernel/user space manipulations and global assumptions from the protocol

* layers proper - AC.

/

/* 功能:系统调用的入口函数

输入:call表示具体被调用的应用层的接口函数（如 bind）；args指向用户空间中的参数数组

输出:对应sock_xxx函数的返回值，call无效时返回-EINVAL

*/

/*
 * Entry point of the socketcall multiplexor.
 *
 * call: selects the socket operation (SYS_SOCKET, SYS_BIND, ...).
 * args: user-space array of unsigned long holding that operation's
 *       arguments. The needed slots are checked for readability, then
 *       fetched with get_fs_long().
 *
 * Returns the result of the selected sock_xxx() routine, a verify_area()
 * error, or -EINVAL for an unknown call number.
 */
asmlinkage int sys_socketcall(int call, unsigned long *args)
{
	int er;

	switch(call)
	{
		case SYS_SOCKET:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_socket(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_BIND:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_bind(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_CONNECT:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_connect(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_LISTEN:
			er=verify_area(VERIFY_READ, args, 2 * sizeof(long));
			if(er)
				return er;
			return(sock_listen(get_fs_long(args+0),
				get_fs_long(args+1)));
		case SYS_ACCEPT:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_accept(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_GETSOCKNAME:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_getsockname(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_GETPEERNAME:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_getpeername(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_SOCKETPAIR:
			er=verify_area(VERIFY_READ, args, 4 * sizeof(long));
			if(er)
				return er;
			return(sock_socketpair(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(unsigned long *)get_fs_long(args+3)));
		case SYS_SEND:
			er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_send(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3)));
		case SYS_SENDTO:
			er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_sendto(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3),
				(struct sockaddr *)get_fs_long(args+4),
				get_fs_long(args+5)));
		case SYS_RECV:
			er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_recv(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3)));
		case SYS_RECVFROM:
			er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_recvfrom(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3),
				(struct sockaddr *)get_fs_long(args+4),
				(int *)get_fs_long(args+5)));
		case SYS_SHUTDOWN:
			er=verify_area(VERIFY_READ, args, 2 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_shutdown(get_fs_long(args+0),
				get_fs_long(args+1)));
		case SYS_SETSOCKOPT:
			er=verify_area(VERIFY_READ, args, 5 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_setsockopt(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(char *)get_fs_long(args+3),
				get_fs_long(args+4)));
		case SYS_GETSOCKOPT:
			er=verify_area(VERIFY_READ, args, 5 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_getsockopt(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(char *)get_fs_long(args+3),
				(int *)get_fs_long(args+4)));
		default:
			return(-EINVAL);
	}
}

/

* This function is called by a protocol handler that wants to

* advertise its address family, and have it linked into the

* SOCKET module.

/

/对于不同family具有不同的操作集/

/*
 * Called by a protocol handler that wants to advertise its address
 * family: stores 'ops' in the first free pops[] slot and stamps it with
 * 'family'. Runs with interrupts disabled while the table is edited.
 *
 * Returns the slot index, or -ENOMEM if the table is full.
 */
int sock_register(int family, struct proto_ops *ops)
{
	int i;

	cli();
	for(i = 0; i < NPROTO; i++)
	{
		if (pops[i] != NULL)
			continue;
		pops[i] = ops;
		pops[i]->family = family;
		sti();
		return(i);
	}
	sti();
	return(-ENOMEM);
}

/*

* This function is called by a protocol handler that wants to

* remove its address family, and have it unlinked from the

* SOCKET module.

/

/*
 * Called by a protocol handler that wants to withdraw its address
 * family: clears the first pops[] slot registered for 'family'. The
 * table is scanned with interrupts disabled.
 *
 * Returns the index of the cleared slot, or -ENOENT if the family is
 * not registered.
 */
int sock_unregister(int family)
{
	int slot;
	int result = -ENOENT;

	cli();
	for (slot = 0; slot < NPROTO; slot++)
	{
		if (pops[slot] != NULL && pops[slot]->family == family)
		{
			pops[slot] = NULL;
			result = slot;
			break;
		}
	}
	sti();
	return(result);
}

  //网络部分协议初始化，

void proto_init(void)

{

extern struct net_proto protocols[]; / Network protocols /

struct net_proto pro;

/* Kick all configured protocols. /

pro = protocols;

while (pro->name != NULL)

{

(pro->init_func)(pro);

pro++;

}

/* We’re all done… /

}

//系统网络栈初始化总入口函数，在start_kernel函数中被调用对整个网络栈进行初始化

void sock_init(void)

{

int i;

printk(“Swansea University Computer Society NET3.019\n”);

/

* Initialize all address (protocol) families.

/

for (i = 0; i < NPROTO; ++i) pops[i] = NULL;

/

* Initialize the protocols module.

/

     //网络协议初始化

proto_init();

#ifdef CONFIG_NET

/

* Initialize the DEV module.

/

     //网卡驱动初始化和操作的下半部分初始化

dev_init();

/

* And the bottom half handler

/

bh_base[NET_BH].routine= net_bh;

enable_bh(NET_BH);

#endif

}

/*
 * Format the socket usage line into 'buffer' and return the window of it
 * selected by offset/length.
 *
 * *start is set to the first byte to hand back. Returns the number of
 * bytes available from *start, capped at 'length', or 0 when offset is
 * at or past the end of the text.
 */
int socket_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len = sprintf(buffer, "sockets: used %d\n", sockets_in_use);

	if (offset >= len)
	{
		*start = buffer;
		return 0;
	}
	*start = buffer + offset;
	len -= offset;
	if (len > length)
		len = length;
	return len;
}

<基于上的作品创作,转载请注明！>

阅读(1768) | 评论(0) | 转发(0) |

0

上一篇：网络协议栈实现分析<2>–套接字系统调用

下一篇：网络协议栈实现分析<4>–两个重要数据结构

给主人留下些什么吧！~~

关于我们 | 关于IT168 | 联系方式 | 广告合作 | 法律声明 | 免费注册

Copyright 2001-2010 ChinaUnix.net All Rights Reserved 北京皓辰网域网络信息技术有限公司. 版权所有

感谢所有关心和支持过ChinaUnix的朋友们