The implementation of bind:
First, a few address structures.
struct sockaddr is effectively the base-class address structure: every other socket address structure can be cast directly to it. For example, when sa_family is PF_INET, sa_data holds the port number and IP address (a struct in_addr).
- struct sockaddr {
-     sa_family_t sa_family;    /* address family, AF_xxx */
-     char        sa_data[14];  /* 14 bytes of protocol address */
- };
Next comes sockaddr_in, which represents every IPv4 address; you can see it is effectively a subclass of sockaddr.
- struct sockaddr_in {
-     sa_family_t     sin_family;    /* address family: AF_INET */
-     __be16          sin_port;      /* port number, network byte order */
-     struct in_addr  sin_addr;      /* the IPv4 address */
-
-     /* pad out to the size of struct sockaddr */
-     unsigned char   __pad[__SOCK_SIZE__ - sizeof(short int) -
-                           sizeof(unsigned short int) - sizeof(struct in_addr)];
- };
The kernel also has a newer address structure, sockaddr_storage, which is large enough to hold a socket address of any type (IPv4, IPv6, ...). Note that, unlike sockaddr, it is explicitly aligned.
- struct __kernel_sockaddr_storage {
-     unsigned short  ss_family;    /* address family */
-
-     /* padding large enough for any address type the kernel supports */
-     char            __data[_K_SS_MAXSIZE - sizeof(unsigned short)];
-
- } __attribute__ ((aligned(_K_SS_ALIGNSIZE)));
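As a quick user-space illustration of how these structures fit together (a minimal sketch; the port 8080 is arbitrary and error handling is mostly omitted):
- #include <arpa/inet.h>
- #include <netinet/in.h>
- #include <sys/socket.h>
-
- int make_bound_fd(void)
- {
-     int fd = socket(AF_INET, SOCK_STREAM, 0);
-     struct sockaddr_in addr = {
-         .sin_family = AF_INET,
-         .sin_port   = htons(8080),                     /* network byte order */
-         .sin_addr   = { .s_addr = htonl(INADDR_ANY) },
-     };
-     /* every sockaddr_* is passed as the "base class" struct sockaddr */
-     if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
-         return -1;
-     return fd;
- }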
Next, a few data structures related to bind.
The first is inet_hashinfo, which mainly manages TCP's bind hash buckets. (tcp_hashinfo is initialized in TCP's initialization function and then assigned to the h field of tcp_prot, so we can later reach it through the sock_common part of a sock.) We'll walk through that flow below.
- struct inet_hashinfo {
-     /* hash of sockets with a full identity, i.e. in states between
-      * TCP_ESTABLISHED and TCP_CLOSE (TIME_WAIT sockets included) */
-     struct inet_ehash_bucket    *ehash;
-     rwlock_t                    *ehash_locks;
-     unsigned int                ehash_size;
-     unsigned int                ehash_locks_mask;
-
-     /* the bind hash: one chain of inet_bind_bucket entries per
-      * hashed local port */
-     struct inet_bind_hashbucket *bhash;
-     unsigned int                bhash_size;
-
-     /* all sockets in the listening state, hashed by local port */
-     struct hlist_head           listening_hash[INET_LHTABLE_SIZE];
-
-     /* lock, user count and wait queue for the listening hash */
-     rwlock_t                    lhash_lock ____cacheline_aligned;
-     atomic_t                    lhash_users;
-     wait_queue_head_t           lhash_wait;
-     struct kmem_cache           *bind_bucket_cachep;
- };
struct inet_ehash_bucket manages all sockets whose TCP state lies between TCP_ESTABLISHED and TCP_CLOSE. Note that twchain holds the sockets in TIME_WAIT state.
- struct inet_ehash_bucket {
-     struct hlist_head chain;      /* established sockets */
-     struct hlist_head twchain;    /* TIME_WAIT sockets */
- };
An inet_bind_bucket holds the information for one port in use; it ultimately gets linked into the bhash table.
- struct inet_bind_bucket {
-     struct net        *ib_net;    /* owning network namespace */
-     unsigned short    port;       /* the bound port */
-     /* >0 means the port may be shared by sockets that set SO_REUSEADDR */
-     signed short      fastreuse;
-     struct hlist_node node;       /* link into the bhash chain */
-     struct hlist_head owners;     /* all sockets bound to this port */
- };
The last structure is tcp_hashinfo. It is initialized in tcp_init, and tcp_init is invoked from inet_init. tcp_hashinfo is then wired into tcp_prot, and from there it is reachable through a sock's sk_prot field.
- struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
- .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
- };
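For reference, the hookup described above looks roughly like this in net/ipv4/tcp_ipv4.c (heavily abridged; only the two fields relevant here are shown):
- struct proto tcp_prot = {
-     .name        = "TCP",
-     /* ... */
-     .get_port    = inet_csk_get_port,
-     /* ... */
-     .h.hashinfo  = &tcp_hashinfo,
- };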
Now for the implementation of bind itself. The system call behind bind is sys_bind:
- asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
- {
-     struct socket *sock;
-     struct sockaddr_storage address;
-     int err, fput_needed;
-
-     /* find the socket that belongs to this fd */
-     sock = sockfd_lookup_light(fd, &err, &fput_needed);
-     if (sock) {
-         /* copy the user-space address into the kernel */
-         err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
-         if (err >= 0) {
-             err = security_socket_bind(sock,
-                                        (struct sockaddr *)&address,
-                                        addrlen);
-             if (!err)
-                 /* the protocol-level bind; inet_bind for AF_INET */
-                 err = sock->ops->bind(sock,
-                                       (struct sockaddr *)
-                                       &address, addrlen);
-         }
-         /* drop the file reference taken by sockfd_lookup_light */
-         fput_light(sock->file, fput_needed);
-     }
-     return err;
- }
sockfd_lookup_light looks up the socket corresponding to fd:
- static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
- {
-     struct file *file;
-     struct socket *sock;
-
-     *err = -EBADF;
-     /* fd -> struct file */
-     file = fget_light(fd, fput_needed);
-     if (file) {
-         /* struct file -> struct socket (stored in file->private_data) */
-         sock = sock_from_file(file, err);
-         if (sock)
-             return sock;
-         fput_light(file, *fput_needed);
-     }
-     return NULL;
- }
Next, the implementation of inet_bind.
- int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
- {
-     struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
-     struct sock *sk = sock->sk;
-     struct inet_sock *inet = inet_sk(sk);
-     unsigned short snum;
-     int chk_addr_ret;
-     int err;
-
-     /* if the protocol supplies its own bind (e.g. raw sockets), use it */
-     if (sk->sk_prot->bind) {
-         err = sk->sk_prot->bind(sk, uaddr, addr_len);
-         goto out;
-     }
-     err = -EINVAL;
-     if (addr_len < sizeof(struct sockaddr_in))
-         goto out;
-
-     /* classify the address: local, multicast, broadcast, ... */
-     chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
-
-     err = -EADDRNOTAVAIL;
-     /* unless binding to non-local addresses is allowed, the address
-      * must be ANY, local, multicast or broadcast */
-     if (!sysctl_ip_nonlocal_bind &&
-         !inet->freebind &&
-         addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
-         chk_addr_ret != RTN_LOCAL &&
-         chk_addr_ret != RTN_MULTICAST &&
-         chk_addr_ret != RTN_BROADCAST)
-         goto out;
-
-     snum = ntohs(addr->sin_port);
-     err = -EACCES;
-     /* ports below PROT_SOCK (1024) need CAP_NET_BIND_SERVICE */
-     if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
-         goto out;
-
-     lock_sock(sk);
-
-     err = -EINVAL;
-     /* the socket must be closed and not yet bound to a local port */
-     if (sk->sk_state != TCP_CLOSE || inet->num)
-         goto out_release_sock;
-
-     /* record the local address */
-     inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
-     /* for multicast/broadcast, use the address only for receiving */
-     if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-         inet->saddr = 0;
-
-     /* grab the port; for TCP this is inet_csk_get_port */
-     if (sk->sk_prot->get_port(sk, snum)) {
-         inet->saddr = inet->rcv_saddr = 0;
-         err = -EADDRINUSE;
-         goto out_release_sock;
-     }
-
-     if (inet->rcv_saddr)
-         sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
-     if (snum)
-         sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-
-     /* get_port stored the chosen port in inet->num */
-     inet->sport = htons(inet->num);
-     inet->daddr = 0;
-     inet->dport = 0;
-     sk_dst_reset(sk);
-     err = 0;
- out_release_sock:
-     release_sock(sk);
- out:
-     return err;
- }
Let me first give an overview of inet_csk_get_port.
When the port to bind is 0, the kernel has to allocate a new port itself:
1. Get the system's local port range.
2. Pick a random port within that range.
3. Index bhash with the candidate port to get its chain of inet_bind_bucket entries.
4. Walk the chain (an empty chain means the port is unused). If the port is taken, increment it and loop until a port is found that does not appear in bhash.
5. Create a new inet_bind_bucket and insert it into bhash.
When a port is specified:
1. Hash the port and look up the corresponding inet_bind_bucket in bhash.
2. If an entry exists, the port is already in use, so we must check whether it is allowed to be reused.
3. If no entry exists, proceed as in step 5 above.
A simplified sketch of this logic follows; the real function comes after it.
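A deliberately simplified sketch (no locking, no namespace checks; bhash_lookup, bhash_insert and check_reuse_conflict are made-up names standing in for the chain walks in the real code):
- /* illustrative sketch only -- not kernel code */
- int get_port_sketch(unsigned short snum)
- {
-     if (!snum) {                              /* case 1: kernel picks a port */
-         int low, high;
-         inet_get_local_port_range(&low, &high);
-         int remaining = high - low + 1;
-         int rover = low + net_random() % remaining;
-         while (bhash_lookup(rover) && --remaining > 0)
-             if (++rover > high)               /* wrap around the range */
-                 rover = low;
-         if (remaining <= 0)
-             return 1;                         /* every port is taken */
-         snum = rover;
-     } else if (bhash_lookup(snum)) {          /* case 2: port already in use */
-         return check_reuse_conflict(snum);    /* 0 only if reuse is allowed */
-     }
-     bhash_insert(snum);                       /* new inet_bind_bucket */
-     return 0;
- }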
- int inet_csk_get_port(struct sock *sk, unsigned short snum)
- {
-     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-     struct inet_bind_hashbucket *head;
-     struct hlist_node *node;
-     struct inet_bind_bucket *tb;
-     int ret;
-     struct net *net = sock_net(sk);
-
-     local_bh_disable();
-     if (!snum) {
-         /* no port given: pick one at random in the local range */
-         int remaining, rover, low, high;
-
-         inet_get_local_port_range(&low, &high);
-         remaining = (high - low) + 1;
-         rover = net_random() % remaining + low;
-
-         do {
-             /* bucket chain for this candidate port */
-             head = &hashinfo->bhash[inet_bhashfn(net, rover,
-                                                  hashinfo->bhash_size)];
-             spin_lock(&head->lock);
-             inet_bind_bucket_for_each(tb, node, &head->chain)
-                 if (tb->ib_net == net && tb->port == rover)
-                     /* port already in use: try the next one */
-                     goto next;
-             break;
-         next:
-             spin_unlock(&head->lock);
-             /* wrap around at the top of the range */
-             if (++rover > high)
-                 rover = low;
-         } while (--remaining > 0);
-
-         /* the whole range was scanned without finding a free port */
-         ret = 1;
-         if (remaining <= 0)
-             goto fail;
-         /* found a free port */
-         snum = rover;
-     } else {
-         /* a port was specified: look it up in bhash */
-         head = &hashinfo->bhash[inet_bhashfn(net, snum,
-                                              hashinfo->bhash_size)];
-         spin_lock(&head->lock);
-         inet_bind_bucket_for_each(tb, node, &head->chain)
-             if (tb->ib_net == net && tb->port == snum)
-                 goto tb_found;
-     }
-     tb = NULL;
-     goto tb_not_found;
- tb_found:
-     /* the port is in use: is sharing it allowed? */
-     if (!hlist_empty(&tb->owners)) {
-         if (tb->fastreuse > 0 &&
-             sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
-             goto success;
-         } else {
-             ret = 1;
-             /* let the protocol decide whether this bind conflicts */
-             if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
-                 goto fail_unlock;
-         }
-     }
- tb_not_found:
-     ret = 1;
-     /* create a new bucket for this port and add it to bhash */
-     if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-                                              net, head, snum)) == NULL)
-         goto fail_unlock;
-     if (hlist_empty(&tb->owners)) {
-         /* the first owner decides whether the port is fast-reusable */
-         if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-             tb->fastreuse = 1;
-         else
-             tb->fastreuse = 0;
-     } else if (tb->fastreuse &&
-                (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-         tb->fastreuse = 0;
- success:
-     /* link the socket onto the bucket's owners list, set inet->num */
-     if (!inet_csk(sk)->icsk_bind_hash)
-         inet_bind_hash(sk, tb, snum);
-     WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
-     ret = 0;
-
- fail_unlock:
-     spin_unlock(&head->lock);
- fail:
-     local_bh_enable();
-     return ret;
- }
Before looking at the listen code, let's again examine the related data structures.
We've already met inet_connection_sock; it contains an icsk_accept_queue field of type request_sock_queue, so let's start with that structure.
A request_sock_queue represents a queue of request_socks. Recall that TCP keeps two queues: the half-open queue (connections in SYN_RECV state) and the completed-connection queue (connections in established state). The former holds connections that have just received a SYN and are waiting for the three-way handshake to finish; the latter holds connections whose handshake is complete and that are waiting to be read by accept.
Every arriving SYN segment creates a new request_sock, which is added to the listen_sock's request_sock hash table. Once the three-way handshake completes, the request is moved onto the rskq_accept_head/rskq_accept_tail queue of the request_sock_queue, from which accept then reads directly.
- struct request_sock_queue {
-     /* the completed-connection (accept) queue */
-     struct request_sock *rskq_accept_head;
-     struct request_sock *rskq_accept_tail;
-     rwlock_t            syn_wait_lock;
-     u8                  rskq_defer_accept;
-
-     /* the half-open connections live in here */
-     struct listen_sock  *listen_opt;
- };
A listen_sock represents a socket in the listening state.
- struct listen_sock {
-     /* log2 of the maximum queue length */
-     u8  max_qlen_log;
-     /* current number of half-open connections */
-     int qlen;
-     /* entries whose SYN/ACK has not yet been retransmitted */
-     int qlen_young;
-     int clock_hand;
-     /* random seed for the syn_table hash */
-     u32 hash_rnd;
-     /* number of slots in syn_table */
-     u32 nr_table_entries;
-     /* hash table of pending request_socks */
-     struct request_sock *syn_table[0];
- };
Finally, request_sock. It stores the fields both ends need for the upcoming transfer: window sizes, the peer's MSS, the peer's packet sequence numbers, and so on.
- struct request_sock {
-     struct request_sock *dl_next;    /* next in the queue/hash chain */
-     u16 mss;                         /* the peer's advertised MSS */
-     u8  retrans;                     /* SYN/ACK retransmission count */
-     u8  cookie_ts;                   /* syncookie: encode tstamps */
-
-     u32 window_clamp;                /* window clamp at creation time */
-     u32 rcv_wnd;                     /* rcv_wnd offered first time */
-     u32 ts_recent;                   /* timestamp to echo to the peer */
-     unsigned long expires;           /* when the SYN/ACK times out */
-
-     const struct request_sock_ops *rsk_ops;
-     struct sock *sk;                 /* set once the child is created */
-     u32 secid;
-     u32 peer_secid;
- };
The system call behind listen is sys_listen. It first looks up the socket with sockfd_lookup_light and then calls inet_listen; the overall flow is much like bind, just with inet_listen in the middle.
One more concept here: backlog. In Linux, backlog is the size of the completed-connection queue, not the sum of that and the half-open queue. The half-open queue generally ends up about the same size; its maximum length is computed from backlog, as we'll see shortly.
So let's go straight to the implementation of inet_listen. It mostly performs sanity checks and then calls inet_csk_listen_start to do the real work:
- int inet_listen(struct socket *sock, int backlog)
- {
-     struct sock *sk = sock->sk;
-     unsigned char old_state;
-     int err;
-
-     lock_sock(sk);
-
-     err = -EINVAL;
-     /* only unconnected stream sockets may listen */
-     if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
-         goto out;
-
-     old_state = sk->sk_state;
-     /* and only from the CLOSE or LISTEN TCP states */
-     if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
-         goto out;
-
-     /* if we're not already listening, start now; calling listen
-      * again on a listening socket only updates the backlog */
-     if (old_state != TCP_LISTEN) {
-         err = inet_csk_listen_start(sk, backlog);
-         if (err)
-             goto out;
-     }
-     sk->sk_max_ack_backlog = backlog;
-     err = 0;
-
- out:
-     release_sock(sk);
-     return err;
- }
Now the implementation of inet_csk_listen_start.
Its main work is to allocate a new listen_sock, attach it to the listen_opt of inet_connection_sock's icsk_accept_queue, re-check the port currently in use, and finally return:
- int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
- {
-     struct inet_sock *inet = inet_sk(sk);
-     struct inet_connection_sock *icsk = inet_csk(sk);
-     /* allocate the listen_sock (including the SYN table) */
-     int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
-
-     if (rc != 0)
-         return rc;
-
-     sk->sk_max_ack_backlog = 0;
-     sk->sk_ack_backlog = 0;
-     inet_csk_delack_init(sk);
-
-     /* There is race window here: we announce ourselves listening,
-      * but this transition is still not validated by get_port().
-      * It is OK, because this socket enters to hash table only
-      * after validation is complete. */
-     sk->sk_state = TCP_LISTEN;
-     /* make sure we still own the port (or can get one) */
-     if (!sk->sk_prot->get_port(sk, inet->num)) {
-         inet->sport = htons(inet->num);
-         /* flush any cached route */
-         sk_dst_reset(sk);
-         /* put the socket into the listening hash */
-         sk->sk_prot->hash(sk);
-
-         return 0;
-     }
-     /* the port check failed: roll back */
-     sk->sk_state = TCP_CLOSE;
-     __reqsk_queue_destroy(&icsk->icsk_accept_queue);
-     return -EADDRINUSE;
- }
Finally, let's look at the implementation of reqsk_queue_alloc:
- int sysctl_max_syn_backlog = 256;
-
- int reqsk_queue_alloc(struct request_sock_queue *queue,
-                       unsigned int nr_table_entries)
- {
-     size_t lopt_size = sizeof(struct listen_sock);
-     struct listen_sock *lopt;
-
-     /* clamp the table size between 8 and sysctl_max_syn_backlog,
-      * then round up to a power of two */
-     nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
-     nr_table_entries = max_t(u32, nr_table_entries, 8);
-     nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
-
-     lopt_size += nr_table_entries * sizeof(struct request_sock *);
-     if (lopt_size > PAGE_SIZE)
-         lopt = __vmalloc(lopt_size,
-                          GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
-                          PAGE_KERNEL);
-     else
-         lopt = kzalloc(lopt_size, GFP_KERNEL);
-     if (lopt == NULL)
-         return -ENOMEM;
-
-     /* max_qlen_log = log2 of the table size, but at least 3 */
-     for (lopt->max_qlen_log = 3;
-          (1 << lopt->max_qlen_log) < nr_table_entries;
-          lopt->max_qlen_log++);
-
-     get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
-     rwlock_init(&queue->syn_wait_lock);
-     queue->rskq_accept_head = NULL;
-     lopt->nr_table_entries = nr_table_entries;
-
-     /* publish the new listen_sock */
-     write_lock_bh(&queue->syn_wait_lock);
-     queue->listen_opt = lopt;
-     write_unlock_bh(&queue->syn_wait_lock);
-
-     return 0;
- }
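A worked example may help (assuming the default sysctl_max_syn_backlog of 256). For listen(fd, 5) the computation runs as follows:
- /* nr_table_entries = min(5, 256)               = 5
-  * nr_table_entries = max(5, 8)                 = 8
-  * nr_table_entries = roundup_pow_of_two(8 + 1) = 16
-  * max_qlen_log     = 4   (smallest n >= 3 with 1 << n >= 16)
-  * so syn_table gets 16 slots, and the half-open queue counts as
-  * full once qlen reaches 1 << 4 == 16. */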
Now let's see how the kernel handles the half-open queue and the accept queue during the three-way handshake (in other words, the server-side state transitions). How the two queues are represented in the kernel was covered above, so we won't repeat it.
When a packet comes up from layer 3, the layer-4 protocol handler is invoked; for TCP that is tcp_v4_rcv (see my earlier posts for how that dispatch works). tcp_v4_rcv eventually calls tcp_v4_do_rcv to process the input packet. Before looking at tcp_v4_do_rcv, let's see how tcp_v4_rcv finds the sock object for a packet from its 4-tuple (source/destination addresses and ports).
Some background first: once a TCP connection finishes its three-way handshake, the kernel creates a new socket, most of whose fields are copied from the listening socket. The new socket's state is set to established, while the listening socket stays in the listen state.
As analyzed in an earlier post, inet_hashinfo keeps listening sockets separate from sockets in states between TCP_ESTABLISHED and TCP_CLOSE: one set lives in listening_hash, the other in ehash. Looking up a socket by 4-tuple therefore operates on these two hash tables separately.
The kernel looks the socket up by calling __inet_lookup:
- sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
-                    th->source, iph->daddr, th->dest, inet_iif(skb));
-
- static inline struct sock *__inet_lookup(struct net *net,
-                                          struct inet_hashinfo *hashinfo,
-                                          const __be32 saddr, const __be16 sport,
-                                          const __be32 daddr, const __be16 dport,
-                                          const int dif)
- {
-     u16 hnum = ntohs(dport);
-     /* try the established hash first... */
-     struct sock *sk = __inet_lookup_established(net, hashinfo,
-                                                 saddr, sport, daddr, hnum, dif);
-     /* ...and fall back to the listening hash */
-     return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
- }
We analyzed tcp_hashinfo above: it holds all the hash state TCP uses (sockets, ports, and so on). The lookup here searches tcp_hashinfo (specifically its ehash and listening_hash fields) for the matching socket.
Notice the two-step lookup. First the established sockets are searched: a socket in established state has completed its handshake, so it can be found with a simple 4-tuple hash into ehash. If __inet_lookup_established finds nothing, __inet_lookup_listener searches the listening sockets instead, matching mainly on daddr (the destination address).
Once the matching socket is found, packet processing proper begins in tcp_v4_do_rcv:
- int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
- {
-     struct sock *rsk;
-     ..................................................
-     /* established: the fast path for data transfer */
-     if (sk->sk_state == TCP_ESTABLISHED) {
-         TCP_CHECK_TIMER(sk);
-         if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
-             rsk = sk;
-             goto reset;
-         }
-         TCP_CHECK_TIMER(sk);
-         return 0;
-     }
-
-     if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
-         goto csum_err;
-
-     if (sk->sk_state == TCP_LISTEN) {
-         /* either the listening socket itself (first SYN) or a
-          * newly created child socket (final ACK) */
-         struct sock *nsk = tcp_v4_hnd_req(sk, skb);
-         if (!nsk)
-             goto discard;
-
-         if (nsk != sk) {
-             /* a child socket: finish its handshake */
-             if (tcp_child_process(sk, nsk, skb)) {
-                 rsk = nsk;
-                 goto reset;
-             }
-             return 0;
-         }
-     }
-
-     TCP_CHECK_TIMER(sk);
-     /* every state except ESTABLISHED and TIME_WAIT ends up here */
-     if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
-         rsk = sk;
-         goto reset;
-     }
-     TCP_CHECK_TIMER(sk);
-     return 0;
-     ......................................................................
- }
As you can see, the socket's state selects different processing paths; there are really three cases: TCP_ESTABLISHED, TCP_LISTEN, and everything else.
We won't analyze TCP_ESTABLISHED here.
Consider what happens when the first SYN segment arrives. The packet enters tcp_v4_hnd_req (covered later); for now all we need to know is that for a first SYN it returns the listening socket itself. So nsk == sk, and we fall into tcp_rcv_state_process, which handles every state except ESTABLISHED and TIME_WAIT.
Here we only look at its LISTEN handling; the remaining states will be covered as we meet them:
- int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
-                           struct tcphdr *th, unsigned len)
- {
-     struct tcp_sock *tp = tcp_sk(sk);
-     struct inet_connection_sock *icsk = inet_csk(sk);
-     int queued = 0;
-     tp->rx_opt.saw_tstamp = 0;
-
-     switch (sk->sk_state) {
-     case TCP_LISTEN:
-         /* an ACK makes no sense on a listening socket */
-         if (th->ack)
-             return 1;
-         /* ignore RST */
-         if (th->rst)
-             goto discard;
-         /* a SYN: hand it to the protocol's conn_request,
-          * which is tcp_v4_conn_request for IPv4 */
-         if (th->syn) {
-             if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
-                 return 1;
-             kfree_skb(skb);
-             return 0;
-         }
-         goto discard;
-     ............................................................
- }
So a SYN segment ultimately lands in tcp_v4_conn_request; that's the function we look at next.
First, a few related helpers. The first is reqsk_queue_is_full, which decides whether the half-open queue is full. The implementation is just a shift of qlen by max_qlen_log: the result is non-zero exactly when qlen >= 2^max_qlen_log:
- static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
- {
-     /* non-zero exactly when qlen >= 2^max_qlen_log */
-     return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
- }
The second is sk_acceptq_is_full, which decides whether the accept queue is full. Also simple: it compares the current queue length sk_ack_backlog with the maximum sk_max_ack_backlog.
- static inline int sk_acceptq_is_full(struct sock *sk)
- {
- return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
- }
The last is tcp_openreq_init, which initializes a newly allocated request_sock. Recall that each arriving SYN allocates an inet_request_sock, which is then added to the half-open queue.
- static inline void tcp_openreq_init(struct request_sock *req,
- struct tcp_options_received *rx_opt,
- struct sk_buff *skb)
- {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- req->rcv_wnd = 0;
- req->cookie_ts = 0;
- tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
- req->mss = rx_opt->mss_clamp;
- req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
- ireq->tstamp_ok = rx_opt->tstamp_ok;
- ireq->sack_ok = rx_opt->sack_ok;
- ireq->snd_wscale = rx_opt->snd_wscale;
- ireq->wscale_ok = rx_opt->wscale_ok;
- ireq->acked = 0;
- ireq->ecn_ok = 0;
- ireq->rmt_port = tcp_hdr(skb)->source;
- }
Now for the implementation of tcp_v4_conn_request:
- int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
- {
-     struct inet_request_sock *ireq;
-     struct tcp_options_received tmp_opt;
-     struct request_sock *req;
-     __be32 saddr = ip_hdr(skb)->saddr;
-     __be32 daddr = ip_hdr(skb)->daddr;
-     /* non-zero only when the SYN legitimately reuses a TIME_WAIT
-      * connection */
-     __u32 isn = TCP_SKB_CB(skb)->when;
-     struct dst_entry *dst = NULL;
- #ifdef CONFIG_SYN_COOKIES
-     int want_cookie = 0;
- #else
- #define want_cookie 0
- #endif
-
-     /* never answer a SYN sent to a broadcast/multicast address */
-     if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
-         goto drop;
-
-     /* half-open queue full: fall back to syncookies or drop */
-     if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
- #ifdef CONFIG_SYN_COOKIES
-         if (sysctl_tcp_syncookies) {
-             want_cookie = 1;
-         } else
- #endif
-         goto drop;
-     }
-     /* accept queue full while young half-open entries remain: drop */
-     if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
-         goto drop;
-     req = inet_reqsk_alloc(&tcp_request_sock_ops);
-     if (!req)
-         goto drop;
-     ...................................................
-
-     /* default options before parsing the SYN */
-     tcp_clear_options(&tmp_opt);
-     tmp_opt.mss_clamp = 536;
-     tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
-
-     /* parse the TCP options carried by the SYN */
-     tcp_parse_options(skb, &tmp_opt, 0);
-
-     .......................................................
-
-     /* initialize the request_sock from the SYN */
-     tcp_openreq_init(req, &tmp_opt, skb);
-     ...............................................
-
-     ireq->opt = tcp_v4_save_options(sk, skb);
-     if (!want_cookie)
-         TCP_ECN_create_request(req, tcp_hdr(skb));
-
-     if (want_cookie) {
- #ifdef CONFIG_SYN_COOKIES
-         syn_flood_warning(skb);
-         req->cookie_ts = tmp_opt.tstamp_ok;
- #endif
-         isn = cookie_v4_init_sequence(sk, skb, &req->mss);
-     } else if (!isn) {
-         .............................................
-         /* choose our initial sequence number */
-         isn = tcp_v4_init_sequence(skb);
-     }
-
-     tcp_rsk(req)->snt_isn = isn;
-
-     /* send the SYN/ACK: the second step of the handshake */
-     if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
-         goto drop_and_free;
-
-     /* add the request to the half-open queue and arm the SYN/ACK
-      * retransmission timer */
-     inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
-     return 0;
-
- drop_and_release:
-     dst_release(dst);
- drop_and_free:
-     reqsk_free(req);
- drop:
-     return 0;
- }
tcp_v4_hnd_req's main job is to check whether the half-open queue holds an entry for the current packet. If it does, the packet may be the final ACK of the handshake, so a series of validity checks is performed (retransmission, RST, SYN, and so on); once the packet is confirmed to be the ACK, the virtual function syn_recv_sock is called to create the new socket.
- static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
- {
-     struct tcphdr *th = tcp_hdr(skb);
-     const struct iphdr *iph = ip_hdr(skb);
-     struct sock *nsk;
-     struct request_sock **prev;
-     /* is there a pending request for this 4-tuple? */
-     struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
-                                                    iph->saddr, iph->daddr);
-     if (req)
-         /* probably the final ACK: validate it and create the child */
-         return tcp_check_req(sk, skb, req, prev);
-
-     /* maybe the connection is already established (the child has
-      * been created and lives in ehash) */
-     nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
-                                   th->source, iph->daddr, th->dest, inet_iif(skb));
-
-     if (nsk) {
-         if (nsk->sk_state != TCP_TIME_WAIT) {
-             bh_lock_sock(nsk);
-             return nsk;
-         }
-         inet_twsk_put(inet_twsk(nsk));
-         return NULL;
-     }
-
- #ifdef CONFIG_SYN_COOKIES
-     if (!th->rst && !th->syn && th->ack)
-         sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
- #endif
-     return sk;
- }
tcp_check_req's central job is to call that virtual function, create a new socket, and return it.
First, a few related helpers. The first is inet_csk_reqsk_queue_unlink, which unlinks an element from the half-open queue:
- static inline void inet_csk_reqsk_queue_unlink(struct sock *sk,
-                                                struct request_sock *req,
-                                                struct request_sock **prev)
- {
-     reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev);
- }
-
- static inline void reqsk_queue_unlink(struct request_sock_queue *queue,
-                                       struct request_sock *req,
-                                       struct request_sock **prev_req)
- {
-     write_lock(&queue->syn_wait_lock);
-     /* unlink req from its hash chain */
-     *prev_req = req->dl_next;
-     write_unlock(&queue->syn_wait_lock);
- }
The second is inet_csk_reqsk_queue_removed, which updates the qlen and qlen_young counters:
- static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
-                                                 struct request_sock *req)
- {
-     /* once the half-open queue is empty, the SYN/ACK retransmission
-      * timer is no longer needed */
-     if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
-         inet_csk_delete_keepalive_timer(sk);
- }
-
- static inline int reqsk_queue_removed(struct request_sock_queue *queue,
-                                       struct request_sock *req)
- {
-     struct listen_sock *lopt = queue->listen_opt;
-     /* a request is "young" until its SYN/ACK is retransmitted */
-     if (req->retrans == 0)
-         --lopt->qlen_young;
-
-     return --lopt->qlen;
- }
The last is inet_csk_reqsk_queue_add, which appends the new request to the accept queue:
- static inline void inet_csk_reqsk_queue_add(struct sock *sk,
-                                             struct request_sock *req,
-                                             struct sock *child)
- {
-     reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child);
- }
-
- static inline void reqsk_queue_add(struct request_sock_queue *queue,
-                                    struct request_sock *req,
-                                    struct sock *parent,
-                                    struct sock *child)
- {
-     req->sk = child;
-     sk_acceptq_added(parent);    /* bumps parent->sk_ack_backlog */
-
-     /* append to the tail of the accept queue */
-     if (queue->rskq_accept_head == NULL)
-         queue->rskq_accept_head = req;
-     else
-         queue->rskq_accept_tail->dl_next = req;
-
-     queue->rskq_accept_tail = req;
-     req->dl_next = NULL;
- }
Now the implementation of tcp_check_req:
- struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
-                            struct request_sock *req,
-                            struct request_sock **prev)
- {
-     const struct tcphdr *th = tcp_hdr(skb);
-     __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
-     int paws_reject = 0;
-     struct tcp_options_received tmp_opt;
-     struct sock *child;
-
-     tmp_opt.saw_tstamp = 0;
-     ......................................
-     /* an unexpected RST or SYN kills the embryonic connection */
-     if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
-         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
-         goto embryonic_reset;
-     }
-
-     /* a pure retransmitted SYN: just resend our SYN/ACK */
-     if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
-         flg == TCP_FLAG_SYN &&
-         !paws_reject) {
-         req->rsk_ops->rtx_syn_ack(sk, req);
-         return NULL;
-     }
-
-     ..........................................
-
-     /* from here on, only an ACK completes the handshake */
-     if (!(flg & TCP_FLAG_ACK))
-         return NULL;
-
-     /* TCP_DEFER_ACCEPT: ignore a bare ACK, wait for data */
-     if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
-         TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
-         inet_rsk(req)->acked = 1;
-         return NULL;
-     }
-
-     /* handshake complete: create the child socket
-      * (tcp_v4_syn_recv_sock for IPv4) */
-     child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
-     if (child == NULL)
-         goto listen_overflow;
-     ..................................
- #endif
-     /* move the request off the half-open queue... */
-     inet_csk_reqsk_queue_unlink(sk, req, prev);
-     inet_csk_reqsk_queue_removed(sk, req);
-     /* ...and onto the accept queue */
-     inet_csk_reqsk_queue_add(sk, req, child);
-     return child;
-
- listen_overflow:
-     if (!sysctl_tcp_abort_on_overflow) {
-         inet_rsk(req)->acked = 1;
-         return NULL;
-     }
-
- embryonic_reset:
-     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
-     if (!(flg & TCP_FLAG_RST))
-         req->rsk_ops->send_reset(sk, skb);
-
-     inet_csk_reqsk_queue_drop(sk, req, prev);
-     return NULL;
- }
Finally, how the kernel creates the new socket. TCP implements this with tcp_v4_syn_recv_sock, and what it does is conceptually simple: create a new socket with its state set to TCP_SYN_RECV (in inet_csk_clone) while the parent socket stays in the listen state, copy a number of fields into the new socket, and initialize some timers. The timers are all skipped here; TCP's timers will get a dedicated analysis later.
After tcp_v4_hnd_req returns, tcp_v4_do_rcv checks whether the returned socket equals the parent, and if not calls tcp_child_process.
tcp_child_process completes the final step of the three-way handshake: it drives the child socket to TCP_ESTABLISHED and, when the conditions are met, wakes the parent socket blocked in accept:
- int tcp_child_process(struct sock *parent, struct sock *child,
-                       struct sk_buff *skb)
- {
-     int ret = 0;
-     int state = child->sk_state;
-
-     if (!sock_owned_by_user(child)) {
-         /* the child is in TCP_SYN_RECV; this drives it to ESTABLISHED */
-         ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
-                                     skb->len);
-         /* the child moved out of SYN_RECV: notify the parent, which
-          * may be sleeping in accept */
-         if (state == TCP_SYN_RECV && child->sk_state != state)
-             parent->sk_data_ready(parent, 0);
-     } else {
-         /* the child is locked by user context: queue the skb on its
-          * backlog, to be processed when release_sock runs */
-         sk_add_backlog(child, skb);
-     }
-
-     bh_unlock_sock(child);
-     sock_put(child);
-     return ret;
- }
Finally, the TCP_SYN_RECV handling in tcp_rcv_state_process. It mainly prepares for the upcoming data transfer by setting the relevant fields:
- case TCP_SYN_RECV:
-     if (acceptable) {
-         tp->copied_seq = tp->rcv_nxt;
-         smp_mb();
-         /* the handshake is complete */
-         tcp_set_state(sk, TCP_ESTABLISHED);
-         sk->sk_state_change(sk);
-
-         /* signal writability to anyone waiting on the socket */
-         if (sk->sk_socket)
-             sk_wake_async(sk,
-                           SOCK_WAKE_IO, POLL_OUT);
-
-         /* record the peer's ack and window state */
-         tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
-         tp->snd_wnd = ntohs(th->window) <<
-                       tp->rx_opt.snd_wscale;
-         tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
-                     TCP_SKB_CB(skb)->seq);
-
-         .........................................................................
-         break;
Now let's look at the implementation of accept.
What accept does is quite simple: take a socket whose three-way handshake has completed off the accept queue, attach it to the VFS (much like what sys_socket does when creating a socket), and return it. One thing to note: if the socket passed to accept is non-blocking, accept returns immediately even when the accept queue is empty; if it is blocking, the caller sleeps until an entry appears on the accept queue and wakes it up. The sketch below shows that contract from user space.
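A minimal user-space sketch of the non-blocking behaviour (assumes listen_fd already has O_NONBLOCK set; retrying from an epoll loop is just one option):
- #include <errno.h>
- #include <sys/socket.h>
-
- int try_accept(int listen_fd)
- {
-     int conn = accept(listen_fd, NULL, NULL);
-     if (conn < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
-         return -1;   /* accept queue empty: retry later (e.g. from epoll) */
-     return conn;     /* an fd for a socket taken off the accept queue */
- }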
The system call behind accept is sys_accept, which in turn calls do_accept, so let's go straight to do_accept:
- long do_accept(int fd, struct sockaddr __user *upeer_sockaddr,
-                int __user *upeer_addrlen, int flags)
- {
-     struct socket *sock, *newsock;
-     struct file *newfile;
-     int err, len, newfd, fput_needed;
-     struct sockaddr_storage address;
-     .............................................
-     /* the listening socket */
-     sock = sockfd_lookup_light(fd, &err, &fput_needed);
-     if (!sock)
-         goto out;
-
-     err = -ENFILE;
-     /* allocate a new socket for the incoming connection */
-     if (!(newsock = sock_alloc()))
-         goto out_put;
-
-     newsock->type = sock->type;
-     newsock->ops = sock->ops;
-
-     /* We don't need try_module_get here, as the listening socket
-      * (sock) already holds the protocol module */
-     __module_get(newsock->ops->owner);
-
-     /* allocate an fd and a struct file for the new socket */
-     newfd = sock_alloc_fd(&newfile, flags & O_CLOEXEC);
-     if (unlikely(newfd < 0)) {
-         err = newfd;
-         sock_release(newsock);
-         goto out_put;
-     }
-
-     err = sock_attach_fd(newsock, newfile, flags & O_NONBLOCK);
-     if (err < 0)
-         goto out_fd_simple;
-
-     err = security_socket_accept(sock, newsock);
-     if (err)
-         goto out_fd;
-
-     /* the protocol-level accept: inet_accept for AF_INET */
-     err = sock->ops->accept(sock, newsock, sock->file->f_flags);
-     if (err < 0)
-         goto out_fd;
-
-     /* hand the peer's address back to user space if requested */
-     if (upeer_sockaddr) {
-         if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
-                                   &len, 2) < 0) {
-             err = -ECONNABORTED;
-             goto out_fd;
-         }
-         err = move_addr_to_user((struct sockaddr *)&address,
-                                 len, upeer_sockaddr, upeer_addrlen);
-         if (err < 0)
-             goto out_fd;
-     }
-
-     /* publish the fd; from here on user space can see it */
-     fd_install(newfd, newfile);
-     err = newfd;
-
-     security_socket_post_accept(sock, newsock);
-
- out_put:
-     fput_light(sock->file, fput_needed);
- out:
-     return err;
-     .......................................
- }
The flow is simple; the real work is concentrated in inet_accept, which does three things:
1. Call inet_csk_accept to operate on the accept queue; it returns the sock it took off the queue.
2. Graft the sock returned by inet_csk_accept onto the socket passed in (the one new'ed in do_accept). This is also why do_accept only had to create a socket and not a sock: the sock comes straight off the accept queue.
3. Set the new socket's state to SS_CONNECTED.
- int inet_accept(struct socket *sock, struct socket *newsock, int flags)
- {
-     struct sock *sk1 = sock->sk;
-     int err = -EINVAL;
-     /* take a completed connection off the queue (inet_csk_accept) */
-     struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
-
-     if (!sk2)
-         goto do_err;
-
-     lock_sock(sk2);
-
-     WARN_ON(!((1 << sk2->sk_state) &
-               (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
-     /* attach the sock to the new socket (and thereby to the vfs) */
-     sock_graft(sk2, newsock);
-
-     newsock->state = SS_CONNECTED;
-     err = 0;
-     release_sock(sk2);
- do_err:
-     return err;
- }
inet_csk_accept just takes a sock off the accept queue and returns it.
Before its source, a couple of related helpers.
First, reqsk_queue_empty, which checks whether the accept queue is empty:
- static inline int reqsk_queue_empty(struct request_sock_queue *queue)
- {
- return queue->rskq_accept_head == NULL;
- }
Then reqsk_queue_get_child, which pops one sock off the accept queue:
- static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
-                                                  struct sock *parent)
- {
-     /* dequeue the first request */
-     struct request_sock *req = reqsk_queue_remove(queue);
-     /* the child socket created during the handshake */
-     struct sock *child = req->sk;
-
-     WARN_ON(child == NULL);
-     /* decrement parent->sk_ack_backlog and free the request */
-     sk_acceptq_removed(parent);
-     __reqsk_free(req);
-     return child;
- }
There is also inet_csk_wait_for_connect, used to sleep while the accept queue is empty. Every socket has a wait queue (see any reference on kernel wait queues; I won't elaborate here): each process calling accept declares a wait-queue entry, links it into the listening socket's wait queue, and sleeps until woken.
- static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
- {
-     struct inet_connection_sock *icsk = inet_csk(sk);
-     DEFINE_WAIT(wait);
-     int err;
-     ..................................................
-     for (;;) {
-         /* queue ourselves (exclusively) on the socket's wait queue */
-         prepare_to_wait_exclusive(sk->sk_sleep, &wait,
-                                   TASK_INTERRUPTIBLE);
-         release_sock(sk);
-         /* sleep until woken up or the timeout expires */
-         if (reqsk_queue_empty(&icsk->icsk_accept_queue))
-             timeo = schedule_timeout(timeo);
-         lock_sock(sk);
-         err = 0;
-         /* something arrived on the accept queue: done */
-         if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
-             break;
-         err = -EINVAL;
-         if (sk->sk_state != TCP_LISTEN)
-             break;
-         err = sock_intr_errno(timeo);
-         if (signal_pending(current))
-             break;
-         /* timed out */
-         err = -EAGAIN;
-         if (!timeo)
-             break;
-     }
-     finish_wait(sk->sk_sleep, &wait);
-     return err;
- }
Now the source of inet_csk_accept. Note the blocking/non-blocking distinction again: in the non-blocking case it returns immediately even if the accept queue is empty, setting the error to -EAGAIN.
- struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
- {
-     struct inet_connection_sock *icsk = inet_csk(sk);
-     struct sock *newsk;
-     int error;
-
-     lock_sock(sk);
-
-     /* accept only works on a listening socket */
-     error = -EINVAL;
-     if (sk->sk_state != TCP_LISTEN)
-         goto out_err;
-
-     /* nothing on the accept queue yet */
-     if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
-         /* 0 for non-blocking sockets, else the receive timeout */
-         long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
-         /* non-blocking: bail out right away */
-         error = -EAGAIN;
-         if (!timeo)
-             goto out_err;
-         /* blocking: sleep until a connection completes */
-         error = inet_csk_wait_for_connect(sk, timeo);
-         if (error)
-             goto out_err;
-     }
-     /* pop the child socket off the accept queue */
-     newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-     WARN_ON(newsk->sk_state == TCP_SYN_RECV);
- out:
-     release_sock(sk);
-     return newsk;
- out_err:
-     newsk = NULL;
-     *err = error;
-     goto out;
- }
Lastly, a rough analysis of the implementation of connect. Its flow is:
1. Get the socket from fd and copy the address into kernel space.
2. Call inet_stream_connect for the main processing.
Note that connect also distinguishes blocking from non-blocking: a blocking socket calls inet_wait_for_connect and sleeps until the handshake completes, while a non-blocking one returns immediately (see the user-space sketch below).
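Before the kernel side, here is what that contract looks like from user space (a minimal sketch; polling for writability and then reading SO_ERROR is the conventional way to pick up the handshake result, and the 5-second timeout is arbitrary):
- #include <errno.h>
- #include <poll.h>
- #include <sys/socket.h>
-
- /* fd has O_NONBLOCK set; addr/len filled in as usual */
- int connect_nonblock(int fd, const struct sockaddr *addr, socklen_t len)
- {
-     if (connect(fd, addr, len) == 0)
-         return 0;                      /* connected immediately (rare for TCP) */
-     if (errno != EINPROGRESS)
-         return -1;                     /* a real error */
-
-     struct pollfd pfd = { .fd = fd, .events = POLLOUT };
-     if (poll(&pfd, 1, 5000) <= 0)      /* wait for the handshake, 5s cap */
-         return -1;
-
-     int soerr = 0;
-     socklen_t slen = sizeof(soerr);
-     getsockopt(fd, SOL_SOCKET, SO_ERROR, &soerr, &slen);
-     return soerr ? -1 : 0;             /* 0: three-way handshake completed */
- }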
- asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
-                             int addrlen)
- {
-     struct socket *sock;
-     struct sockaddr_storage address;
-     int err, fput_needed;
-
-     sock = sockfd_lookup_light(fd, &err, &fput_needed);
-     if (!sock)
-         goto out;
-     /* copy the peer address into the kernel */
-     err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address);
-     if (err < 0)
-         goto out_put;
-
-     err =
-         security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
-     if (err)
-         goto out_put;
-     /* inet_stream_connect for TCP */
-     err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
-                              sock->file->f_flags);
- out_put:
-     fput_light(sock->file, fput_needed);
- out:
-     return err;
- }
Now inet_stream_connect. Its main work is:
1. Check the socket state: only when it is SS_UNCONNECTED, i.e. not yet connected, does it call tcp_v4_connect to start the connection.
2. Check the TCP state: sk_state must be TCPF_SYN_SENT or TCPF_SYN_RECV to proceed further.
3. If the state fits and the socket is blocking, call inet_wait_for_connect and sleep until the handshake completes; otherwise return immediately with the error set to EINPROGRESS.
- int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-                         int addr_len, int flags)
- {
-     struct sock *sk = sock->sk;
-     int err;
-     long timeo;
-
-     lock_sock(sk);
-     ............................................
-
-     switch (sock->state) {
-     default:
-         err = -EINVAL;
-         goto out;
-     case SS_CONNECTED:
-         err = -EISCONN;
-         goto out;
-     case SS_CONNECTING:
-         err = -EALREADY;
-         /* fall through: the handshake may have completed meanwhile */
-         break;
-     case SS_UNCONNECTED:
-         err = -EISCONN;
-         if (sk->sk_state != TCP_CLOSE)
-             goto out;
-         /* tcp_v4_connect: this sends the SYN */
-         err = sk->sk_prot->connect(sk, uaddr, addr_len);
-         if (err < 0)
-             goto out;
-
-         sock->state = SS_CONNECTING;
-         /* the error for the non-blocking return below */
-         err = -EINPROGRESS;
-         break;
-     }
-     /* 0 for non-blocking sockets, else the send timeout */
-     timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
-
-     if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
-         /* non-blocking: return -EINPROGRESS; blocking: sleep until
-          * the handshake finishes or the timeout expires */
-         if (!timeo || !inet_wait_for_connect(sk, timeo))
-             goto out;
-
-         err = sock_intr_errno(timeo);
-         if (signal_pending(current))
-             goto out;
-     }
-
-     /* the connection was reset or timed out while we slept */
-     if (sk->sk_state == TCP_CLOSE)
-         goto sock_error;
-     /* handshake done */
-     sock->state = SS_CONNECTED;
-     err = 0;
- out:
-     release_sock(sk);
-     return err;
-
- sock_error:
-     err = sock_error(sk) ? : -ECONNABORTED;
-     sock->state = SS_UNCONNECTED;
-     if (sk->sk_prot->disconnect(sk, flags))
-         sock->state = SS_DISCONNECTING;
-     goto out;
- }
I won't analyze the source of tcp_v4_connect line by line; roughly, it:
1. Validates the address.
2. Calls ip_route_connect to find the outgoing route (including selecting an ephemeral port, and so on).
3. Sets the sock state to TCP_SYN_SENT and calls inet_hash_connect to pick an ephemeral (outgoing) port and add the socket to the corresponding hash chain (very similar to get_port).
4. Calls tcp_connect to do the rest. That function builds the SYN to be sent (window size, ISN, and so on), appends the sk_buff to the socket's write queue, and finally hands it to layer 3 via tcp_transmit_skb. For what happens below that, see my earlier posts.
Finally, the client-side state transitions of the three-way handshake, again in tcp_rcv_state_process. Assume the socket entering here is in TCP_SYN_SENT state, i.e. waiting for the SYN and ACK segments:
- int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
-                           struct tcphdr *th, unsigned len)
- {
-     ..........................................
-
-     switch (sk->sk_state) {
-     case TCP_CLOSE:
-         goto discard;
-
-     case TCP_LISTEN:
-         ..................................
-
-     case TCP_SYN_SENT:
-         /* handle the incoming SYN/ACK (or a simultaneous SYN) */
-         queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
-         if (queued >= 0)
-             return queued;
-
-         /* Do step6 onward by hand. */
-         tcp_urg(sk, skb, th);
-         __kfree_skb(skb);
-         tcp_data_snd_check(sk);
-         return 0;
-     }
Then the state changes inside tcp_rcv_synsent_state_process:
- static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
-                                          struct tcphdr *th, unsigned len)
- {
-     ..................
-     /* the expected case: a SYN/ACK */
-     if (th->ack) {
-         ....................................
-
-         if (th->rst) {
-             tcp_reset(sk);
-             goto discard;
-         }
-
-         if (!th->syn)
-             goto discard_and_undo;
-
-         ..................................................
-         /* got both SYN and ACK: the client side is connected */
-         tcp_set_state(sk, TCP_ESTABLISHED);
-
-         .......................................
-     }
-
-     ....................................................
-     /* a SYN without an ACK: simultaneous open */
-     if (th->syn) {
-         tcp_set_state(sk, TCP_SYN_RECV);
-
-         ...................................
-         /* answer with our own SYN/ACK */
-         tcp_send_synack(sk);
-         goto discard;
- #endif
-     }
-     ...................
- }
If only a SYN was received, the three-way handshake is not yet complete and we are still waiting for the final ACK, so the next arriving segment drops into tcp_rcv_state_process again:
- if (th->ack) {
-     /* validate the ACK */
-     int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
-
-     switch (sk->sk_state) {
-     case TCP_SYN_RECV:
-         if (acceptable) {
-             tp->copied_seq = tp->rcv_nxt;
-             smp_mb();
-             /* the final ACK: the handshake is complete */
-             tcp_set_state(sk, TCP_ESTABLISHED);
-             sk->sk_state_change(sk);
-             /* wake up anyone waiting for writability */
-             if (sk->sk_socket)
-                 sk_wake_async(sk,
-                               SOCK_WAKE_IO, POLL_OUT);
-
-             ........................................
-         } else {
-             return 1;
-         }
-         break;