内核中的TCP的追踪分析－2-追踪TCP（IPV4)的socket的创建-qinjiana0786-ChinaUnix博客

无名

首页　| 　博文目录　| 　关于我

qinjiana0786

博客访问： 1330419
博文数量： 107
博客积分： 10155
博客等级：上将
技术积分： 2166
用户组：普通用户
注册时间： 2008-03-25 16:57

文章分类

全部博文（107）

如何从应用程序进（24）
allegro（38）
LINUX（11）
TCP/IP（33）
未分配的博文（1）

文章存档

2010年（1）

2009年（1）

2008年（105）

我的朋友

相关博文

内核中的TCP的追踪分析－2-追踪TCP（IPV4)的socket的创建

分类： LINUX

2008-11-05 10:34:32

今天真是不幸，写好的文章竟然在编辑器里被错误冲掉了。本来已经完成了本章节的内容，却在最后关头出现了一个IE错误，现在是重写的，请朋友们见谅。我是无名小卒，请转载的朋友注明出处，谢谢。昨天我们看到了inet_create()函数，今天我们继续。

/*
 * inet_create() - opening part of the listing; the rest of the body
 * (from the lookup_protocol label onward) is quoted further down in
 * the article.
 *
 * Parameters as used in the visible code:
 *   net      - passed on to inet_netns_ok() and sk_alloc()
 *   sock     - the struct socket being set up; its type selects the
 *              inetsw[] list that is searched
 *   protocol - requested protocol number; may be rewritten during the
 *              lookup when it is the IPPROTO_IP wild card
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct list_head *p;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	/*
	 * For any socket type other than RAW and DGRAM, generate the
	 * hash secret if it is still zero (i.e. not generated yet).
	 */
	if (sock->type != SOCK_RAW &&
	    sock->type != SOCK_DGRAM &&
	    !inet_ehash_secret)
		build_ehash_secret();

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
	answer = NULL;

这个函数首先检查socket的类型：如果既不是原始的socket（SOCK_RAW）也不是udp的socket（SOCK_DGRAM），并且还没有生成加密字符inet_ehash_secret，就会调用build_ehash_secret来生成一个

/*
 * build_ehash_secret() - set inet_ehash_secret to a random non-zero
 * 32-bit value, unless it has already been set.
 */
void build_ehash_secret(void)
{
	u32 rnd;

	/*
	 * Retry until the value is non-zero: inet_create() tests
	 * !inet_ehash_secret to decide whether a secret still has to be
	 * generated, so zero must not be stored as the secret.
	 */
	do {
		get_random_bytes(&rnd, sizeof(rnd));
	} while (rnd == 0);

	/*
	 * Re-check under inetsw_lock so that an already-stored secret is
	 * never overwritten.
	 */
	spin_lock_bh(&inetsw_lock);
	if (!inet_ehash_secret)
		inet_ehash_secret = rnd;
	spin_unlock_bh(&inetsw_lock);
}

get_random_bytes是取得一个随机数即“熵”，取得后赋值给加密字符使用。然后上面的函数中将socket设置为未连接状态。我们继续往下看

/*
 * Remainder of inet_create(): look up the inet_protosw entry for the
 * requested type/protocol, allocate and initialise the struct sock,
 * then call the protocol's init hook.  Returns 0 on success or a
 * negative errno value.
 */
lookup_protocol:
	/* Error reported if the list for this socket type is empty. */
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_rcu(p, &inetsw[sock->type]) {
		answer = list_entry(p, struct inet_protosw, list);

		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				/* Caller passed the wild card: adopt this entry's protocol. */
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		/* At least one entry was examined but did not match. */
		err = -EPROTONOSUPPORT;
		answer = NULL;
	}

	if (unlikely(answer == NULL)) {
		/* Up to two module-load attempts, each followed by a fresh lookup. */
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	/* A capability is only demanded when the entry's value is positive. */
	err = -EPERM;
	if (answer->capability > 0 && !capable(answer->capability))
		goto out_rcu_unlock;

	err = -EAFNOSUPPORT;
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;

	/*
	 * Copy everything needed out of the entry before leaving the RCU
	 * read section; 'answer' is not used after rcu_read_unlock().
	 */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	BUG_TRAP(answer_prot->slab != NULL);

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	/* Raw sockets record the protocol number in inet->num. */
	if (SOCK_RAW == sock->type) {
		inet->num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->id = 0;

	/* Further initialise sk and associate it with sock. */
	sock_init_data(sock, sk);

	sk->sk_destruct = inet_sock_destruct;
	sk->sk_family = PF_INET;
	sk->sk_protocol = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl = -1;
	inet->mc_loop = 1;
	inet->mc_ttl = 1;
	inet->mc_index = 0;
	inet->mc_list = NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->sport = htons(inet->num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	/* Protocol-specific initialisation; release sk again if it fails. */
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	/* Error exit for paths that still hold the RCU read lock. */
	rcu_read_unlock();
	goto out;
}

这段代码看似复杂其实分析起来并不算难，首先上面

list_for_each_rcu(p, &inetsw[sock->type])是一个宏，我们看一下

/*
 * list_for_each_rcu - iterate over a list, loading each next pointer
 * through rcu_dereference().
 * @pos:  the struct list_head pointer used as the loop cursor
 * @head: the head of the list
 *
 * The loop condition first issues prefetch(pos->next) and then stops
 * once pos has come back round to head.  inet_create() uses it
 * between rcu_read_lock() and rcu_read_unlock().
 */
#define list_for_each_rcu(pos, head) \
	for (pos = rcu_dereference((head)->next); \
		prefetch(pos->next), pos != (head); \
		pos = rcu_dereference(pos->next))

这段宏我贴些资料供大家理解，下面这些内容出自

RCU（Read-Copy Update）通过延迟写操作来提高同步性能，具体请参见第3章。这里只分析具有RCU的链表。 RCU常用来保护读操作占多数的链表与数组。具有RCU的链表的操作函数与普通链表操作函数的区别是在函数名后加上了_rcu，如list_for_each_rcu函数。函数list_for_each_rcu的功能是遍历一个rcu保护的链表。其中，参数pos表示用来做链表位置计数的&struct list_head结构，参数head表示链表头。只要遍历被rcu_read_lock()保护，使用诸如list_add_rcu()的函数对链表同时访问是安全的。函数List_for_each_rcu列出如下： #define list_for_each_rcu(pos, head) \ for (pos = (head)->next, prefetch(pos->next); pos != (head); \ pos = rcu_dereference(pos->next), prefetch(pos->next)) 函数rcu_dereference在RCU读临界部分中取出一个RCU保护的指针。在需要内存屏障的体系中进行内存屏障（目前只有Alpha体系需要），函数列出如下： #define rcu_dereference(p) ({ \ typeof(p) _________p1 = p; \ smp_read_barrier_depends(); \ (_________p1); \ }) 在include/asm-i386/system.h中： #define smp_read_barrier_depends() read_barrier_depends()

很明显上面的宏就是以socket类型为下标在inetsw数组中取得对应的链头，然后循环遍历这条链表，找到符合我们协议的节点。那么这个数组是什么时候初始化的呢？我们再像上一节那样看一下

static int __init inet_init(void) { 。。。。。。 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); 。。。。。。 }

我们看到他循环用inet_register_protosw()函数处理inetsw_array数组中的元素

/*
 * Built-in inet_protosw entries.  inet_init() passes each element to
 * inet_register_protosw(), which links it into the inetsw[] list for
 * its socket type.
 *
 * .capability = -1 means no capability is demanded: inet_create()
 * only calls capable() when the value is greater than zero.
 */
static struct inet_protosw inetsw_array[] =
{
	/* TCP: the entry matched by socket(AF_INET, SOCK_STREAM, 0). */
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.capability = -1,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	/* UDP */
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.capability = -1,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
	},

	/* Raw sockets: the only entry here that demands a capability. */
	{
		.type =       SOCK_RAW,
		.protocol =   IPPROTO_IP,	/* wild card */
		.prot =       &raw_prot,
		.ops =        &inet_sockraw_ops,
		.capability = CAP_NET_RAW,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_REUSE,
	}
};

我们结合应用程序的练习看一下

server_sockfd = socket(AF_INET, SOCK_STREAM, 0);

这里我们看到在上面的数组中第一个元素就是我们需要的，他是被下面的函数登记到数组中的

/*
 * inet_register_protosw() - link @p into the inetsw[] list for its
 * socket type.
 *
 * Registration is refused, with a KERN_ERR message, when p->type is
 * not below SOCK_MAX, or when an entry flagged INET_PROTOSW_PERMANENT
 * with the same protocol number is already on the list.  Nothing is
 * returned in either case.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	answer = NULL;
	/* Insertion point defaults to the list head (no permanent entry seen yet). */
	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);

		/* Check only the non-wild match. */
		if (INET_PROTOSW_PERMANENT & answer->flags) {
			if (protocol == answer->protocol)
				break;
			last_perm = lh;
		}

		/* Not a conflict: reset so a completed loop leaves answer == NULL. */
		answer = NULL;
	}
	if (answer)
		goto out_permanent;

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	synchronize_net();

	return;

out_permanent:
	printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
	       protocol);
	goto out;

out_illegal:
	printk(KERN_ERR
	       "Ignoring attempt to register invalid socket type %d.\n",
	       p->type);
	goto out;
}

很明显在上面的循环中找到适合的链头位置，将inetsw_array数组中的元素一一注册登记到inetsw数组对应的链表中。我们这里要看一下inet_protosw结构

/*
 * One registered socket type/protocol combination.  Entries are linked
 * through 'list' into inetsw[type]; inet_create() walks that list,
 * matches on 'protocol', and then copies ops, prot, no_check and flags
 * from the entry it selected.
 */
struct inet_protosw {
	struct list_head list;

	/* These two fields form the lookup key.  */
	unsigned short	 type;	   /* This is the 2nd argument to socket(2). */
	unsigned short	 protocol; /* This is the L4 protocol number.  */

	struct proto	 *prot;	   /* becomes sk->sk_prot via sk_alloc() */
	const struct proto_ops *ops; /* becomes sock->ops in inet_create() */

	int              capability; /* Which (if any) capability do
				      * we need to use this socket
				      * interface?
				      */
	char             no_check;   /* checksum on rcv/xmit/none? */
	unsigned char	 flags;      /* See INET_PROTOSW_* below.  */
};

这个结构是专门用于采用IP协议的socket使用。其内部的变量我们暂时不做分析，也不转译了，我们还是坚持“用时学习”的观念。回到inet_create()函数中，我们已经在inetsw数组中找到了我们的协议类型的链头就会取得其宿主inet_protosw 结构，然后answer = list_entry(p, struct inet_protosw, list);接着函数中对其兼容性进行了检测，最关键的地方是

sock->ops = answer->ops; answer_prot = answer->prot;

这二句首先是为socket的协议操作函数进行了挂钩，我们就要看上面的

{ .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .capability = -1, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, },

结合这个元素的设置我们明白了，上面socket的协议操作函数表被设置成了inet_stream_ops结构，而answer_prot设置成了tcp_prot结构。这是个struct proto结构，我们不看了，其内容很多，但是这个结构的作用得强调一下，它是专门用于socket的传输层使用的结构，而用于网络传输层的结构由另一个结构体来表示struct inet_proto。接着函数中分配了一个sock结构。我是无名小卒，尽管3月份才写博客，其实研究内核很多年了，写这些博客是为了与朋友们共享知识、发扬copyleft精神，所以请转载的朋友注明出处。分配函数我们曾经在unix的socket创建过程中谈到过，我们不细细研究这个函数了，不过要注意其内部的关键地方sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);我们看到他传递了一个answer_prot，即我们说的用于socket传输层的钩子函数结构，给sk_alloc。

/*
 * sk_alloc() - allocate a struct sock and do its basic setup.
 * @net:      network namespace recorded in the new sock
 * @family:   stored in sk->sk_family
 * @priority: allocation flags; __GFP_ZERO is OR-ed in before they are
 *            passed to sk_prot_alloc()
 * @prot:     protocol descriptor stored in both sk_prot and
 *            sk_prot_creator
 *
 * Returns whatever sk_prot_alloc() returned.  The setup is skipped
 * when that result is NULL; inet_create() treats NULL as failure.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		/* get_net() is applied to net before it is stored in sk. */
		sock_net_set(sk, get_net(net));
	}

	return sk;
}

注意上面sk->sk_prot = sk->sk_prot_creator = prot;将传输层的钩子函数挂入到了sock中了。再回到inet_create函数中，代码是一些对sock的初始化，注意sk是sock结构，而sock是socket结构。使用inet = inet_sk(sk);使sk赋值给struct inet_sock *inet结构变量，我们在unix的socket中曾经说了关于unix 的socket是unix_sock，而这里用于INET的socket结构inet_sock。然后函数通过sock_init_data(sock, sk);对sk进一步的初始化操作，并让sock和sk挂起钩来，这个函数我们已经在那些unix 的socket创建文章中分析过了，请不明白的朋友们看那里的文章学习。下面函数中最关键的地方

if (sk->sk_prot->init) { err = sk->sk_prot->init(sk);

这个就是调用了我们上面提到的传输层结构中的钩子函数init.我们看到上面是tcp_prot结构变量。所以进入其init函数中

/*
 * struct proto instance for TCP.  The SOCK_STREAM entry of
 * inetsw_array points to it through .prot, so for a TCP socket
 * sk->sk_prot refers to this table.  Hooks that inet_create() touches:
 *   .init        - called through sk->sk_prot->init(sk)
 *   .hash        - called when inet->num is non-zero
 *   .backlog_rcv - copied into sk->sk_backlog_rcv
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	/* NOTE(review): presumably the per-socket allocation size - confirm in sk_prot_alloc(). */
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	/* The compat_* hooks exist only when CONFIG_COMPAT is defined. */
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};

我是无名小卒，转载请注明出处，不要担心结构有多么大，很多朋友在看代码时总是被结构所吓倒，要知道一个好的结构体也不是一天所成更不是一人所完成的，所以我们要针对场景来分析记忆。其实上面我们就只关心一个地方.init = tcp_v4_init_sock，进入钩子函数

/*
 * tcp_v4_init_sock() - the .init hook of tcp_prot; inet_create() calls
 * it through sk->sk_prot->init(sk) on a newly allocated sock.
 *
 * Initialises the out-of-order queue, timers and prequeue, sets the
 * starting values of the TCP state fields listed below, installs the
 * IPv4-specific operation tables, takes the default buffer sizes from
 * the sysctl arrays and increments tcp_sockets_allocated.
 *
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	/* Two typed views obtained from the same sk. */
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	/* Install the write-space callback on the sock. */
	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* Element [1] of each sysctl array supplies the initial buffer size. */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}

首先函数出现了一个新的结构体struct tcp_sock，这个结构非常大，但是其作用随着我们的分析会越来越清晰，上面代码中对这个结构变量tp和sock结构变量sk进一步的初始化，以后这个地方的初始化工作我们要经常回顾，例如这里的sk->sk_write_space = sk_stream_write_space等钩子函数的挂入，以及缓冲区的相关设置。最后我们回到上一节提到的sys_socket()函数中执行retval = sock_map_fd(sock)来完成创建过程，sock_map_fd我们在unix的socket中详细分析了，他在文件系统中分配一个文件号，以及file文件指针和目录项dentry结构使其与socket挂上钩，最后返回分配的文件号，从此以后我们可以根据这个文件号对创建的socket进行其他操作了。相关内容看unix的socket创建http://blog.chinaunix.net/u2/64681/showart_1300200.html。

阅读(5207) | 评论(1) | 转发(0) |

上一篇：内核中的TCP的追踪分析－1-追踪TCP（IPV4)的socket的初始化

下一篇：内核中的TCP的追踪分析－3-TCP（IPV4)的socket的地址绑定

给主人留下些什么吧！~~

chinaunix网友2008-11-18 10:43:06

sk_stream_write_space 的作用是啥.

回复 | 举报

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6