本文主要分析:三次握手中最后一个ACK段到达时,服务器端的处理路径。
内核版本:3.6
Author:zhangskd @ csdn blog
创建新sock
协议族相关的操作函数,我们要看的是TCP/IPv4的实例ipv4_specific。
-
/*
 * Address-family specific operations used for TCP over IPv4.
 *
 * Excerpt only: the members not discussed in this article are elided
 * ("..."). .syn_recv_sock points at tcp_v4_syn_recv_sock(), which creates
 * the new socket once the three-way handshake has completed.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	...
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	...
};
三次握手完成以后,要为新的连接创建一个传输控制块,并初始化传输控制块。
一个TCP传输控制块是由多层结构体嵌套组成的,由外到内依次为:
tcp_sock
inet_connection_sock
inet_sock
sock
sock_common
所以,初始化要做的工作比较多。
-
-
-
-
-
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req,
-
struct dst_entry *dst)
-
{
-
struct inet_request_sock *ireq;
-
struct inet_sock *newinet;
-
struct tcp_sock *newtp;
-
struct sock *newsk;
-
#ifdef CONFIG_TCP_MD5SIG
-
struct tcp_md5sig_key *key;
-
#endif
-
struct ip_options_rcu *inet_opt;
-
-
-
if (sk_acceptq_is_full(sk))
-
goto exit_overflow;
-
-
-
newsk = tcp_create_openreq_child(sk, req, skb);
-
if (! newsk)
-
goto exit_nonewsk;
-
-
newsk->sk_gso_type = SKB_GSO_TCPV4;
-
inet_sk_rx_dst_set(newsk, skb);
-
-
newtp = tcp_sk(newsk);
-
newinet = inet_sk(newsk);
-
ireq = inet_rsk(req);
-
newinet->inet_daddr = ireq->rmt_addr;
-
newinet->inet_rcv_saddr = ireq->loc_addr;
-
newinet->inet_saddr = ireq->loc_addr;
-
inet_opt = ireq->opt;
-
rcu_assign_pointer(newinet->inet_opt, inet_opt);
-
ireq->opt = NULL;
-
-
newinet->mc_index = inet_iif(skb);
-
newinet->mc_ttl = ip_hdr(skb)->ttl;
-
newinet->rcv_tos = ip_hdr(skb)->tos;
-
inet_csk(newsk)->icsk_ext_hdr_len = 0;
-
if (inet_opt)
-
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
-
newinet->inet_id = newtp->write_seq ^ jiffies;
-
-
if (! dst) {
-
dst = inet_csk_route_child_sock(sk, newsk, req);
-
if (! dst)
-
goto put_and_exit;
-
} else {
-
-
}
-
sk_setup_caps(newsk, dst);
-
-
tcp_mtup_init(newsk);
-
tcp_sync_mss(newsk, dst_mtu(dst));
-
newtp->advmss = dst_metric_advmss(dst);
-
if (tcp_sk(sk)->rx_opt.user_mss && tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
-
newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
-
tcp_initialize_rcv_mss(newsk);
-
-
if (tcp_rsk(req)->snt_synack)
-
tcp_valid_rtt_meas(newsk, tcp_time_stamp - tcp_rsk(req)->snt_synack);
-
newtp->total_retrans = req->retrans;
-
-
#ifdef CONFIG_TCP_MD5SIG
-
-
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) &newinet->inet_daddr, AF_INET);
-
if (key != NULL) {
-
-
-
-
tcp_md5_do_add(newsk, (union tcp_md5_addr *) &newinet->inet_daddr, AF_INET,
-
key->key, key->keylen, GFP_ATOMIC);
-
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
-
}
-
#endif
-
-
-
if (__inet_inherit_port(sk, newsk) < 0)
-
goto put_and_eixt;
-
-
-
__inet_hash_nolisten(newsk, NULL);
-
-
return newsk;
-
-
exit_overflow:
-
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-
exit_nonewsk:
-
dst_release(dst);
-
exit:
-
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-
return NULL;
-
-
put_and_exit:
-
tcp_clear_xmit_timers(newsk);
-
tcp_cleanup_congestion_control(newsk);
-
bh_unlock_sock(newsk);
-
sock_put(newsk);
-
goto exit;
-
}
根据监听传输控制块sock、连接请求块req,为新的连接创建一个传输控制块sock。
初始化此传输控制块对应的inet_sock、inet_connection_sock、tcp_sock结构中的变量。
-
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
-
{
-
-
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
-
-
if (newsk != NULL) {
-
cosnt struct inet_request_sock *ireq = inet_rsk(req);
-
struct tcp_request_sock *treq = tcp_rsk(req);
-
struct inet_connection_sock *newicsk = inet_csk(newsk);
-
struct tcp_sock *newtp = tcp_sk(newsk);
-
struct tcp_sock *oldtp = tcp_sk(sk);
-
struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
-
-
-
if (oldcvp != NULL) { ... }
-
-
-
newtp->pred_flags = 0;
-
-
-
newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
-
newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up
-
= treq->snt_isn + 1 + tcp_s_data_size(oldtp);
-
-
tcp_prequeue_init(newtp);
-
INIT_LIST_HEAD(&newtp->tsq_node);
-
tcp_init_wl(newtp, treq->rcv_isn);
-
-
-
newtp->srtt = 0;
-
newtp->mdev = TCP_TIMEOUT_INIT;
-
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
-
-
-
newtp->packets_out = 0;
-
newtp->retrans_out = 0;
-
newtp->sacked_out = 0;
-
newtp->fackets_out = 0;
-
newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-
tcp_enable_early_retrans(newtp);
-
-
newtp->snd_cwnd = TCP_INIT_CWND;
-
newtp->snd_cwnd_cnt = 0;
-
newtp->bytes_acked = 0;
-
newtp->frto_counter = 0;
-
newtp->frto_highmark = 0;
-
-
-
-
-
if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops &&
-
! try_module_get(newicsk->icsk_ca_ops->owner))
-
newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
-
-
tcp_set_ca_state(newsk, TCP_CA_Open);
-
tcp_init_xmit_timers(newsk);
-
skb_queue_head_init(&newtp->out_of_order_queue);
-
newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1 + tcp_s_data_size(oldtp);
-
-
-
newtp->rx_opt.saw_tstamp = 0;
-
newtp->rx_opt.dsack = 0;
-
newtp->rx_opt.num_sacks = 0;
-
newtp->urg_data = 0;
-
-
-
if (sock_flag(newsk, SOCK_KEEPOPEN))
-
inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
-
-
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
-
if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
-
if (sysctl_tcp_fack)
-
tcp_enable_fack(newtp);
-
}
-
-
newtp->window_clamp = req->window_clamp;
-
newtp->rcv_ssthresh = req->rcv_wnd;
-
newtp->rcv_wnd = req->rcv_wnd;
-
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
-
if (newtp->rx_opt.wscale_ok) {
-
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
-
newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
-
} else {
-
newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
-
newtp->window_clamp = min(newtp->window_clamp, 65535U);
-
}
-
-
newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale);
-
newtp->max_window = newtp->snd_wnd;
-
-
if (newtp->rx_opt.tstamp_ok) {
-
newtp->rx_opt.ts_recent = req->ts_recent;
-
newtp->rx_opt.ts_recent_stamp = get_seconds();
-
newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
-
} else {
-
newtp->rx_opt.ts_recent_stamp = 0;
-
newtp->tcp_header_len = sizeof(struct tcphdr);
-
}
-
-
#ifdef CONFIG_TCP_MD5SIG
-
newtp->md5sig_info = NULL;
-
if (newtp->af_specific->md5_lookup(sk, newsk))
-
newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
-
#endif
-
-
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
-
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
-
-
newtp->rx_opt.mss_clamp = req->mss;
-
TCP_ECN_openreq_child(newtp, req);
-
-
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
-
}
-
-
return newsk;
-
}
克隆一个传输控制块,并对新的传输控制块上锁。
-
-
-
-
-
-
-
-
struct sock *inet_csk_clone_lock(cons struct sock *sk, const struct request_sock *req, const gfp_t priority)
-
{
-
struct sock *newsk = sk_clone_lock(sk, priority);
-
-
if (newsk != NULL) {
-
struct inet_connection_sock *newicsk = inet_csk(newsk);
-
newsk->sk_state = TCP_SYN_RECV;
-
newicsk->icsk_bind_hash = NULL;
-
-
inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
-
inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
-
inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
-
newsk->sk_write_space = sk_stream_write_space;
-
-
newicsk->icsk_retransmits = 0;
-
newicsk->icsk_backoff = 0;
-
newicsk->icsk_probes_out = 0;
-
-
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
-
security_inet_csk_clone(newsk, req);
-
}
-
-
return newsk;
-
}
把newsk链入使用端口的哈希链表中,更新端口的统计信息。
-
int __inet_inherit_port(struct sock *sk, struct sock *child)
-
{
-
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
-
unsigned short port = inet_sk(child)->inet_num;
-
const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size);
-
struct inet_bind_hashbucket *head = &table->bhash[bhash];
-
struct inet_bind_bucket *tb;
-
-
spin_lock(&head->lock);
-
tb = inet_csk(sk)->icsk_bind_hash;
-
-
if (tb->port != port) {
-
-
-
-
-
-
struct hlist_node *node;
-
-
inet_bind_bucket_for_each(tb, node, &head->chain) {
-
if (net_eq(ib_net(tb), sock_net(sk)) && tb->port == port)
-
break;
-
}
-
-
if (! node) {
-
-
tb = inet_bind_bucket_create(table->bind_bucket_cachep, sock_net(sk), head, port);
-
if (! tb) {
-
spin_unlock(&head->lock);
-
return -ENOMEM;
-
}
-
}
-
}
-
-
inet_bind_hash(child, tb, port);
-
spin_unlock(&head->lock);
-
-
return 0;
-
}
-
-
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum)
-
{
-
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
atomic_inc(&hashinfo->bsockets);
-
inet_sk(sk)->inet_num = snum;
-
sk_add_bind_node(sk, &tb->owners);
-
tb->num_owners++;
-
inet_csk(sk)->icsk_bind_hash = tb;
-
}
把newsk链入ESTABLISHED状态的哈希表中。
-
int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
-
{
-
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
struct hlist_nulls_head *list;
-
spinlock_t *lock;
-
struct inet_ehash_bucket *head;
-
int twrefcnt = 0;
-
-
WARN_ON(! sk_unhashed(sk));
-
sk->sk_hash = inet_sk_ehashfn(sk);
-
-
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
-
list = &head->chain;
-
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-
-
spin_lock(lock);
-
__sk_nulls_add_node_rcu(sk, list);
-
-
if (tw) {
-
WARN_ON(sk->sk_hash != tw->tw_hash);
-
twrefcnt = inet_twsk_unhash(tw);
-
}
-
spin_unlock(lock);
-
-
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-
-
return twrefcnt;
-
}
-
-
static inline int inet_sk_ehashfn(const struct sock *sk)
-
{
-
-
const struct inet_sock *inet = inet_sk(sk);
-
-
const __be32 laddr = inet->inet_rcv_saddr;
-
const __u16 lport = inet->inet_num;
-
const __be32 faddr = inet->inet_daddr;
-
const __be16 fport = inet->inet_dport;
-
struct net *net = sock_net(sk);
-
-
return inet_ehashfn(net, laddr, lport, faddr, fport);
-
}
唤醒监听进程
调用tcp_child_process()来做最后的处理:
1. tcp_ack()处理接收到的ACK,更新child的状态为ESTABLISHED。
唤醒child上的等待进程,初始化子传输控制块的一些字段。
2. 唤醒监听sock上的等待进程,以便监听进程执行accept()。
3. 如果child被用户进程占用,则先把ACK段添加到backlog队列中。
-
-
-
-
-
int tcp_child_process(struct sock *parent, struct sock *child, sk_buff *skb)
-
{
-
int ret = 0;
-
int state = child->sk_state;
-
-
-
if (! sock_owned_by_user(child)) {
-
-
-
-
ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
-
-
-
-
-
if (state == TCP_SYN_RECV && child->sk_state != state)
-
parent->sk_data_ready(parent, 0);
-
-
} else {
-
__sk_add_backlog(child, skb);
-
}
-
-
bh_unlock_sock(child);
-
sock_put(child);
-
return ret;
-
}
把数据包添加到backlog队列中。
-
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
-
{
-
-
skb_dst_force(skb);
-
-
-
if (! sk->sk_backlog.tail)
-
sk->sk_backlog.head = skb;
-
else
-
sk->sk_backlog.tail->next = skb;
-
-
sk->sk_backlog.tail = skb;
-
skb->next = NULL;
-
}
子传输控制块调用tcp_ack()处理收到的ACK,把子传输控制块的状态从TCP_SYN_RECV更新为TCP_ESTABLISHED,
并唤醒子传输控制块上的等待进程,更新子传输控制块的一些字段。
-
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len)
-
{
-
struct tcp_sock *tp = tcp_sk(sk);
-
struct inet_connection_sock *icsk = inet_csk(sk);
-
int queued = 0;
-
-
tp->rx_opt.saw_tstamp = 0;
-
-
switch(sk->sk_state) {
-
...
-
}
-
-
if (! tcp_validate_incoming(sk, skb, th, 0))
-
return 0;
-
-
-
if (th->ack) {
-
int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
-
-
switch (sk->sk_state) {
-
case TCP_SYN_RECV:
-
if (acceptable) {
-
tp->copied_seq = tp->rcv_nxt;
-
smp_mb();
-
-
-
tcp_set_state(sk, TCP_ESTABLISHED);
-
-
sk->sk_state_change(sk);
-
-
-
-
-
-
if (sk->sk_socket)
-
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
-
-
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
-
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
-
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
-
if (tp->rx_opt.tstamp_ok)
-
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
-
-
icsk->icsk_af_ops->rebuild_header(sk);
-
-
tcp_init_metrics(sk);
-
tcp_init_congestion_control(sk);
-
-
-
tp->lsndtime = tcp_time_stamp;
-
-
tcp_mtup_init(sk);
-
tcp_initialize_rcv_mss(sk);
-
tcp_init_buffer_space(sk);
-
tcp_fast_path_on(tp);
-
} else
-
return 1;
-
-
break;
-
...
-
}
-
} else
-
goto discard;
-
...
-
discard:
-
__kfree_skb(skb);
-
}
-
return 0;
-
}
-
static void sock_def_wakeup(struct sock *sk)
-
{
-
struct socket_wq *wq;
-
rcu_read_lock();
-
-
wq = rcu_dereference(sk->sk_wq);
-
if (wq_has_sleeper(wq))
-
wake_up_interruptible_all(&wq->wait);
-
rcu_read_unlock();
-
}
-
-
-
static inline bool wq_has_sleeper(struct socket_wq *wq)
-
{
-
smp_mb();
-
return wq && waitqueue_active(&wq->wait);
-
}
-
-
/* Return non-zero when wait queue @q has at least one entry on its task list. */
static inline int waitqueue_active(wait_queue_head_t *q)
{
	int has_waiters = !list_empty(&q->task_list);

	return has_waiters;
}
-
-
/*
 * Wake the waiters on wait queue @x: calls __wake_up() with mode
 * TASK_INTERRUPTIBLE, nr_exclusive 0 and a NULL key. With nr_exclusive 0
 * the exclusive-waiter countdown in __wake_up_common() never reaches zero,
 * so the walk does not stop early.
 */
#define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
-
-
/*
 * Wake up waiters on @q while holding the queue lock with interrupts
 * disabled.
 *
 * @q:            wait queue
 * @mode:         passed to each waiter's wake function
 * @nr_exclusive: number of exclusive waiters after which the walk stops
 * @key:          opaque value passed to each waiter's wake function
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key)
{
	unsigned long irq_state;

	spin_lock_irqsave(&q->lock, irq_state);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, irq_state);
}
-
-
/*
 * Walk the wait queue and call each entry's wake function.
 *
 * The walk stops once @nr_exclusive entries flagged WQ_FLAG_EXCLUSIVE have
 * had their wake function return non-zero. The entry's flags are sampled
 * before the wake function is called.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive,
			     int wake_flags, void *key)
{
	wait_queue_t *waiter, *tmp;

	list_for_each_entry_safe(waiter, tmp, &q->task_list, task_list) {
		unsigned entry_flags = waiter->flags;

		/* The wake function reported nothing woken: try the next entry. */
		if (!waiter->func(waiter, mode, wake_flags, key))
			continue;

		/* Non-exclusive waiters do not count against the limit. */
		if (!(entry_flags & WQ_FLAG_EXCLUSIVE))
			continue;

		if (--nr_exclusive == 0)
			break;
	}
}
阅读(2247) | 评论(0) | 转发(0) |