今天真是不幸,写好的文章竟然在编辑器里被错误冲刷掉了,本来已经完成了本章节的内容,却在最后关头出现了一个IE错误,现在这篇是重写的,请朋友们见谅。我是无名小卒,请转载的朋友注明出处,谢谢。昨天我们看到了inet_create()函数,我们今天继续。
static int inet_create(struct net *net, struct socket *sock, int protocol) { struct sock *sk; struct list_head *p; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; char answer_no_check; int try_loading_module = 0; int err;
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM && !inet_ehash_secret) build_ehash_secret();
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */ answer = NULL;
|
这个函数首先检查要创建的socket是否既不是原始socket(SOCK_RAW)也不是udp的socket(SOCK_DGRAM),并且判断是否已经有了加密字符inet_ehash_secret,如果还没有就会调用build_ehash_secret()来生成一个
/*
 * Seed inet_ehash_secret on first use.
 *
 * A random 32-bit value is drawn outside the lock; zero is rejected because
 * callers treat a zero inet_ehash_secret as "not yet initialised".  The
 * global is only written while holding inetsw_lock, and only if it is still
 * unset at that point.
 */
void build_ehash_secret(void)
{
	u32 seed;

	/* Keep drawing until we get a non-zero value. */
	for (;;) {
		get_random_bytes(&seed, sizeof(seed));
		if (seed != 0)
			break;
	}

	spin_lock_bh(&inetsw_lock);
	if (inet_ehash_secret == 0)
		inet_ehash_secret = seed;
	spin_unlock_bh(&inetsw_lock);
}
|
get_random_bytes是取得一个随机数即“熵”,取得后赋值给加密字符使用。然后上面的函数中将socket设置为未连接状态。我们继续往下看
lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list);
/* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; answer = NULL; }
if (unlikely(answer == NULL)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; }
err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock;
err = -EAFNOSUPPORT; if (!inet_netns_ok(net, protocol)) goto out_rcu_unlock;
sock->ops = answer->ops; answer_prot = answer->prot; answer_no_check = answer->no_check; answer_flags = answer->flags; rcu_read_unlock();
BUG_TRAP(answer_prot->slab != NULL);
err = -ENOBUFS; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); if (sk == NULL) goto out;
err = 0; sk->sk_no_check = answer_no_check; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = 1;
inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
if (SOCK_RAW == sock->type) { inet->num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; }
if (ipv4_config.no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT;
inet->id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct; sk->sk_family = PF_INET; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1; inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_index = 0; inet->mc_list = NULL;
sk_refcnt_debug_inc(sk);
if (inet->num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->sport = htons(inet->num); /* Add to protocol hash chains. */ sk->sk_prot->hash(sk); }
if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) sk_common_release(sk); } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; }
|
这段代码看似复杂其实分析起来并不算难,首先上面
list_for_each_rcu(p, &inetsw[sock->type])是一个宏,我们看一下
/*
 * list_for_each_rcu - iterate over an RCU-protected list
 * @pos:  the struct list_head pointer used as the loop cursor
 * @head: the head of the list
 *
 * Every next pointer is fetched through rcu_dereference(); the node after
 * the cursor is prefetched as part of the loop condition.  The traversal is
 * meant to run inside an rcu_read_lock() section.
 */
#define list_for_each_rcu(pos, head) \
	for (pos = rcu_dereference((head)->next); \
	     prefetch(pos->next), pos != (head); \
	     pos = rcu_dereference(pos->next))
|
这段宏我贴些资料供大家理解,下面这些内容出自
RCU(Read-Copy Update)通过延迟写操作来提高同步性能,具体请参见第3章。这里只分析具有RCU的链表。 RCU常用来保护读操作占多数的链表与数组。具有RCU的链表的操作函数与普通链表操作函数的区别是在函数名后加上了_rcu,如list_for_each_rcu函数。 函数list_for_each_rcu的功能是遍历一个rcu保护的链表。其中,参数pos表示用来做链表位置计数的&struct list_head结构,参数head表示链表头。只要遍历被rcu_read_lock()保护,使用诸如list_add_rcu()的函数对链表同时访问是安全的。 函数list_for_each_rcu列出如下:
/*
 * list_for_each_rcu - iterate over an RCU-protected list (older form)
 * @pos:  the struct list_head pointer used as the loop cursor
 * @head: the head of the list
 *
 * The first element is read directly from (head)->next; every later step
 * goes through rcu_dereference().  The node after the cursor is prefetched
 * after each cursor update.
 */
#define list_for_each_rcu(pos, head) \
	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
	     pos = rcu_dereference(pos->next), prefetch(pos->next))
函数rcu_dereference在RCU读临界部分中取出一个RCU保护的指针。在需要内存屏障的体系中进行内存屏障(目前只有Alpha体系需要),函数列出如下:
/*
 * rcu_dereference - fetch an RCU-protected pointer
 * @p: the pointer expression to read
 *
 * Evaluates @p exactly once into a local copy, issues
 * smp_read_barrier_depends(), and yields the copy as the value of the
 * expression.
 */
#define rcu_dereference(p) ({ \
	typeof(p) _________p1 = p; \
	smp_read_barrier_depends(); \
	(_________p1); \
})
在include/asm-i386/system.h中:
/* smp_read_barrier_depends() expands directly to read_barrier_depends(). */
#define smp_read_barrier_depends() read_barrier_depends()
|
很明显上面的宏就是循环检查inetsw数组找到符合我们socket类型的链头,那么这个数组是什么时候初始化的呢?我们再象上一节那样看一下
static int __init inet_init(void) { 。。。。。。 for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); 。。。。。。 }
|
我们看到他循环用inet_register_protosw()函数处理inetsw_array数组中的元素
/*
 * Built-in protocol switch entries.  inet_init() hands each element to
 * inet_register_protosw(), which links it into the inetsw[] chain selected
 * by its .type.  A .capability of -1 is not positive, so inet_create()
 * skips the capable() check for that entry.
 */
static struct inet_protosw inetsw_array[] = {
	/* Stream sockets: TCP. */
	{
		.type = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot = &tcp_prot,
		.ops = &inet_stream_ops,
		.capability = -1,
		.no_check = 0,
		.flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
	},

	/* Datagram sockets: UDP. */
	{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_UDP,
		.prot = &udp_prot,
		.ops = &inet_dgram_ops,
		.capability = -1,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_PERMANENT,
	},

	/* Raw sockets: match any protocol, gated by CAP_NET_RAW. */
	{
		.type = SOCK_RAW,
		.protocol = IPPROTO_IP, /* wild card */
		.prot = &raw_prot,
		.ops = &inet_sockraw_ops,
		.capability = CAP_NET_RAW,
		.no_check = UDP_CSUM_DEFAULT,
		.flags = INET_PROTOSW_REUSE,
	}
};
|
我们结合应用程序的练习看一下
server_sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
这里我们看到在上面的数组中第一个元素就是我们需要的,他是被下面的函数登记到数组中的
/*
 * Register a protocol switch entry on the inetsw[] chain for its socket type.
 *
 * @p: entry to register; p->type selects the chain and p->protocol is the
 *     value compared against existing permanent entries.
 *
 * The entry is rejected (with a KERN_ERR message) when p->type is not below
 * SOCK_MAX, or when a permanent entry with the same protocol number is
 * already on the chain.  All list work happens under inetsw_lock;
 * synchronize_net() is called after the lock is dropped on every path.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	int protocol = p->protocol;
	struct list_head *chain;
	struct list_head *cursor;
	struct list_head *insert_after;
	struct inet_protosw *clash = NULL;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX) {
		printk(KERN_ERR "Ignoring attempt to register invalid socket type %d.\n", p->type);
	} else {
		chain = &inetsw[p->type];
		insert_after = chain;

		/*
		 * Walk the chain looking only at permanent entries: stop on
		 * one that has our protocol number, otherwise remember the
		 * last permanent entry seen.
		 */
		list_for_each(cursor, chain) {
			struct inet_protosw *entry =
				list_entry(cursor, struct inet_protosw, list);

			if (!(entry->flags & INET_PROTOSW_PERMANENT))
				continue;
			if (entry->protocol == protocol) {
				clash = entry;
				break;
			}
			insert_after = cursor;
		}

		if (clash) {
			/* Permanent protocols may not be overridden. */
			printk(KERN_ERR "Attempt to override permanent protocol %d.\n", protocol);
		} else {
			/*
			 * Insert behind the last permanent entry (or at the
			 * chain head if there is none) so a wild-card lookup
			 * still hits the permanent entries first, while any
			 * existing non-permanent entry is overridden.
			 * Removing this entry later restores the previous
			 * behaviour automatically.
			 */
			list_add_rcu(&p->list, insert_after);
		}
	}

	spin_unlock_bh(&inetsw_lock);

	synchronize_net();
}
|
很明显在上面的循环中找到适合的链头位置,将我们的数组中的元素一一注册登记到数组中。我们这里要看一下inet_protosw结构
/*
 * One protocol switch entry for IP sockets: maps a (socket type, protocol)
 * pair to the struct proto and proto_ops that implement it.
 */
struct inet_protosw {
	/* Links this entry into the inetsw[] chain for its socket type. */
	struct list_head list;

	/* These two fields form the lookup key. */
	unsigned short type; /* This is the 2nd argument to socket(2). */
	unsigned short protocol; /* This is the L4 protocol number. */

	/* Passed to sk_alloc() by inet_create() for the new struct sock. */
	struct proto *prot;
	/* Installed as sock->ops by inet_create(). */
	const struct proto_ops *ops;

	int capability; /* Which (if any) capability do
			 * we need to use this socket
			 * interface?
			 */
	char no_check; /* checksum on rcv/xmit/none? */
	unsigned char flags; /* See INET_PROTOSW_* below. */
};
|
这个结构是专门用于采用IP协议的socket使用。其内部的变量我们暂时不做分析,也不转译了,我们还是坚持“用时学习”的观念。回到inet_create()函数中,我们已经在inetsw数组中找到了我们的协议类型的链头就会取得其宿主inet_protosw 结构,然后answer = list_entry(p, struct inet_protosw, list);接着函数中对其兼容性进行了检测,最关键的地方是
sock->ops = answer->ops; answer_prot = answer->prot;
|
这二句首先是为socket的协议操作函数进行了挂钩,我们就要看上面的
{ .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .capability = -1, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, },
|
结合这个元素的设置我们明白了,上面socket的协议操作函数表被设置成了inet_stream_ops结构,而answer_prot设置成了tcp_prot结构。这是个struct proto结构,我们不看了,其内容很多,但是这个结构的作用得强调一下,它是专门用于socket的传输层使用的结构,而用于网络传输层的结构由另一个结构体来表示struct inet_proto。接着函数中分配了一个sock结构。我是无名小卒,尽管3月份才写博客其实研究内核很多年了,写这些博客是为了与朋友们共享知识发扬copyleft精神,所以请转载的朋友注明出处。分配函数我们曾经在unix的socket创建过程中谈到过,我们不细细研究这个函数了,不过要注意其内部的关键地方sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);我们看到他传递了一个answer_prot即我们说的用于socket传输层的钩子函数结构给sk_alloc。
/*
 * Allocate a zeroed struct sock and bind it to its protocol and namespace.
 *
 * @net:      network namespace; a reference is taken with get_net()
 * @family:   protocol family stored in sk_family
 * @priority: allocation flags; __GFP_ZERO is OR-ed in before allocating
 * @prot:     struct proto stored in both sk_prot and sk_prot_creator
 *
 * Returns the new sock, or NULL if sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	if (!newsk)
		return NULL;

	newsk->sk_family = family;

	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	newsk->sk_prot_creator = prot;
	newsk->sk_prot = prot;

	sock_lock_init(newsk);
	sock_net_set(newsk, get_net(net));

	return newsk;
}
|
注意上面sk->sk_prot = sk->sk_prot_creator = prot;将传输层的钩子函数挂入到了sock中了。再回到inet_create函数中,代码是一些对sock的初始化,注意sk是sock结构,而sock是socket结构。使用inet = inet_sk(sk);把sk转换后赋值给struct inet_sock *inet结构变量,我们在unix的socket中曾经说了关于unix 的socket是unix_sock,而这里用于INET的socket结构是inet_sock。然后函数通过sock_init_data(sock, sk);对sk进一步的初始化操作,并让sock和sk挂起钩来,这个函数我们已经在那些unix 的socket创建文章中分析过了,请不明白的朋友们看那里的文章学习。下面函数中最关键的地方
if (sk->sk_prot->init) { err = sk->sk_prot->init(sk);
|
这个就是调用了我们上面提到的传输层结构中的钩子函数init.我们看到上面是tcp_prot结构变量。所以进入其init函数中
struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .backlog_rcv = tcp_v4_do_rcv, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif };
|
我是无名小卒,转载请注明出处,不要担心结构有多么大,很多朋友在看代码时总是被结构所吓倒,要知道一个好的结构体也不是一天所成更不是一人所完成的,所以我们要针对场景来分析记忆。其实上面我们就只关心一个地方.init = tcp_v4_init_sock,,进入钩子函数
static int tcp_v4_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp);
icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tp->snd_cwnd = 2;
/* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_af_ops = &ipv4_specific; icsk->icsk_sync_mss = tcp_sync_mss; #ifdef CONFIG_TCP_MD5SIG tp->af_specific = &tcp_sock_ipv4_specific; #endif
sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1];
atomic_inc(&tcp_sockets_allocated);
return 0; }
|
首先函数出现了一个新的结构体struct tcp_sock,这个结构非常大,但是其作用随着我们的分析会越来越清晰,上面代码中对这个结构变量tp和sock结构变量sk进一步的初始化,以后这个地方的初始化工作我们要经常回顾,例如这里的sk->sk_write_space = sk_stream_write_space等钩子函数的挂入,以及缓冲区的相关设置。最后我们回到上一节提到的sys_socket()函数中执行retval = sock_map_fd(sock)来完成创建过程,sock_map_fd我们在unix的socket中详细分析了,他在文件系统中分配一个文件号,以及file文件指针和目录项dentry结构使其与socket挂上钩,最后返回分配的文件号,从此以后我们可以根据这个文件号对创建的socket进行其他操作了。相关内容看unix的socket创建http://blog.chinaunix.net/u2/64681/showart_1300200.html。
阅读(5180) | 评论(1) | 转发(0) |