2015年(65)
分类: LINUX
2015-11-08 21:23:32
listen函数由用户空间程序通过listen系统调用来触发执行;在Linux内核中仍然是通过System call vectors(系统调用向量)实现的,入口位于net/socket.c文件中:
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
……….
case SYS_LISTEN:
err = sys_listen(a0, a1);
break;
……………………
}
其中sys_listen(a0, a1);最终调用的是下面SYSCALL_DEFINE2(listen, int, fd, int, backlog)进行处理。
/*
 * listen(2) system call entry point.
 *
 * Resolves fd to its socket, caps backlog at the sysctl_somaxconn value of
 * the socket's network namespace, runs the security hook and, if that
 * permits, invokes the protocol's listen operation.
 *
 * Returns the error stored by sockfd_lookup_light() when the lookup fails,
 * otherwise the result of security_socket_listen() or sock->ops->listen().
 */
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
	int err, fput_needed;
	int limit;		/* upper bound for the listen backlog */
	struct socket *sock;

	/*
	 * As in bind(): map the file descriptor returned at socket creation
	 * back to its struct socket instance.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	/*
	 * The limit is exposed as /proc/sys/net/core/somaxconn (128 by
	 * default according to listen(2)); it is commonly raised on busy
	 * servers.  A backlog above the limit is silently truncated to it.
	 */
	limit = sock_net(sock->sk)->core.sysctl_somaxconn;
	if (limit < (unsigned int)backlog)
		backlog = limit;

	err = security_socket_listen(sock, backlog);

	/*
	 * Dispatch to the socket-layer listen handler.  For TCP this is
	 * inet_listen, installed through inet_stream_ops.listen in
	 * net/ipv4/af_inet.c.
	 */
	if (err == 0)
		err = sock->ops->listen(sock, backlog);

	fput_light(sock->file, fput_needed);
	return err;
}
通过man listen我们看看对backlog的解释
The behavior of the backlog argument on TCP sockets changed with Linux 2.2. Now it specifies the queue length for completely established sockets waiting to be accepted, instead of the number of incomplete connection requests.
The maximum length of the queue for incomplete sockets can be set using /proc/sys/net/ipv4/tcp_max_syn_backlog. When syncookies are enabled there is no logical maximum length and this setting is ignored. See tcp(7) for more information.
If the backlog argument is greater than the value in /proc/sys/net/core/somaxconn, then it is silently truncated to that value; the default value in this file is 128. In kernels before 2.4.25, this limit was a hard coded value, SOMAXCONN, with the value 128.
上面的解释的大体意思为:从Linux2.2内核版本开始,backlog的行为发生了改变,现在该参数指定了等待accepted的全连接队列的长度。而不是半连接的请求的队列长度。全连接需要在完成三次握手之后。
半连接最大长度可以使用/proc/sys/net/ipv4/tcp_max_syn_backlog进行设置。这个默认值为cat /proc/sys/net/ipv4/tcp_max_syn_backlog
1024
当syncookies被设置后,该参数被忽略掉。如果backlog值大于/proc/sys/net/core/somaxconn,它将被截断,默认值为128。也就是说,当传参backlog的值 >= somaxconn时,已完成连接队列的长度最多就是somaxconn。
inet_listen函数主要是做一些检查工作,例如当前连接的状态、sock的类型,最主要的处理在inet_csk_listen_start函数中。
/*
 * Move a stream socket into the listening state, or only adjust the backlog
 * of a socket that is already listening.
 *
 * Most of this function is validation; the real work of starting to listen
 * is done by inet_csk_listen_start().
 *
 * Returns 0 on success, -EINVAL when the socket is not an unconnected
 * SOCK_STREAM socket or its state is neither CLOSE nor LISTEN, or the error
 * reported by fastopen_init_queue() / inet_csk_listen_start().
 */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char prev_state;
	int err = -EINVAL;

	lock_sock(sk);

	/*
	 * Only an unconnected SOCK_STREAM socket may listen; SS_UNCONNECTED
	 * is the state a socket has right after creation.
	 */
	if (sock->state != SS_UNCONNECTED)
		goto out;
	if (sock->type != SOCK_STREAM)
		goto out;

	/* Snapshot the current state; it must be CLOSE or LISTEN. */
	prev_state = sk->sk_state;
	if (((1 << prev_state) & (TCPF_CLOSE | TCPF_LISTEN)) == 0)
		goto out;

	/*
	 * Really, if the socket is already in listen state
	 * we can only allow the backlog to be adjusted.
	 */
	if (prev_state == TCP_LISTEN)
		goto set_backlog;

	/* Check special setups for testing purpose to enable TFO w/o
	 * requiring TCP_FASTOPEN sockopt.
	 * Note that only TCP sockets (SOCK_STREAM) will reach here.
	 * Also fastopenq may already been allocated because this
	 * socket was in TCP_LISTEN state previously but was
	 * shutdown() (rather than close()).
	 */
	if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
	    !inet_csk(sk)->icsk_accept_queue.fastopenq) {
		if (sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1)
			err = fastopen_init_queue(sk, backlog);
		else if (sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2)
			err = fastopen_init_queue(sk,
			    ((uint)sysctl_tcp_fastopen) >> 16);
		else
			err = 0;
		if (err)
			goto out;
	}

	/* Actually start listening. */
	err = inet_csk_listen_start(sk, backlog);
	if (err)
		goto out;

set_backlog:
	/* Reached both for a fresh listener and for a backlog adjustment. */
	sk->sk_max_ack_backlog = backlog;
	err = 0;
out:
	release_sock(sk);
	return err;
}
inet_csk_listen_start函数使TCP传输控制块进入监听状态,实现监听的过程是:为管理连接请求的散列表分配存储空间,接着使TCP的sock状态迁移到LISTEN状态,并通过get_port确认端口可用,然后将sock加入到监听散列表中。
/*
 * Put a connection-oriented sock into the LISTEN state.
 *
 * Allocates the request queue, resets the accept-backlog counters, marks
 * the sock TCP_LISTEN, validates the local port through get_port() and, on
 * success, inserts the sock into the protocol's hash table.
 *
 * Returns 0 on success, the reqsk_queue_alloc() error if the queue cannot
 * be allocated, or -EADDRINUSE when get_port() rejects the port (the sock
 * is then put back to TCP_CLOSE and the queue destroyed).
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int err;

	/* Set up the queue that will hold connection requests. */
	err = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
	if (err)
		return err;

	sk->sk_max_ack_backlog = 0;	/* limit of the accept queue */
	sk->sk_ack_backlog = 0;		/* current accept-queue length */
	inet_csk_delack_init(sk);

	/* There is race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters to hash table only
	 * after validation is complete.
	 */
	sk->sk_state = TCP_LISTEN;

	/*
	 * Re-check the port.  For TCP, get_port is inet_csk_get_port
	 * (tcp_prot.get_port), the same routine bind() uses; it returns 0 on
	 * success.  inet_num is the port given to bind(); per the original
	 * author's note, an unbound socket gets a port assigned here.
	 */
	if (sk->sk_prot->get_port(sk, inet->inet_num)) {
		/* Port not usable: undo the state change and free the queue. */
		sk->sk_state = TCP_CLOSE;
		__reqsk_queue_destroy(&icsk->icsk_accept_queue);
		return -EADDRINUSE;
	}

	inet->inet_sport = htons(inet->inet_num);
	sk_dst_reset(sk);
	/* Add the sock to the hash table; for TCP, hash is inet_hash. */
	sk->sk_prot->hash(sk);
	return 0;
}
/*
 * Allocate and install the listen_sock table of a request queue.
 *
 * nr_table_entries is first limited to sysctl_max_syn_backlog, then raised
 * to at least 8, and finally nr_table_entries + 1 is rounded up to a power
 * of two.  The resulting table is zero-allocated, seeded with a random hash
 * value and published in queue->listen_opt under syn_wait_lock.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int reqsk_queue_alloc(struct request_sock_queue *queue,
		      unsigned int nr_table_entries)
{
	struct listen_sock *lopt;
	size_t lopt_size;

	/*
	 * Because the upper clamp depends on sysctl_max_syn_backlog, growing
	 * this table needs both a larger listen() backlog and a larger
	 * sysctl value (the original author names
	 * /proc/sys/net/ipv4/tcp_max_syn_backlog as its proc file).
	 */
	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
	nr_table_entries = max_t(u32, nr_table_entries, 8);
	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);

	/* Header plus one request_sock pointer per table slot. */
	lopt_size = sizeof(struct listen_sock) +
		    nr_table_entries * sizeof(struct request_sock *);

	/* More than a page goes through vzalloc(), otherwise kzalloc(). */
	lopt = (lopt_size > PAGE_SIZE) ? vzalloc(lopt_size)
				       : kzalloc(lopt_size, GFP_KERNEL);
	if (!lopt)
		return -ENOMEM;

	/*
	 * max_qlen_log ends up as the base-2 logarithm of nr_table_entries,
	 * e.g. 10 for 1024 entries.
	 */
	lopt->max_qlen_log = 3;
	while ((1 << lopt->max_qlen_log) < nr_table_entries)
		lopt->max_qlen_log++;

	/* Random value used for hashing. */
	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
	rwlock_init(&queue->syn_wait_lock);
	queue->rskq_accept_head = NULL;		/* accept queue starts empty */
	lopt->nr_table_entries = nr_table_entries;

	/* Publish the table (icsk_accept_queue.listen_opt for a listener). */
	write_lock_bh(&queue->syn_wait_lock);
	queue->listen_opt = lopt;
	write_unlock_bh(&queue->syn_wait_lock);

	return 0;
}
/*
 * Hash a sock via __inet_hash() with bottom halves disabled.
 * A sock in TCP_CLOSE state is left untouched.
 */
void inet_hash(struct sock *sk)
{
	if (sk->sk_state == TCP_CLOSE)
		return;

	local_bh_disable();
	__inet_hash(sk);
	local_bh_enable();
}
/*
 * Insert a sock into the hash tables of its protocol.
 *
 * A sock that is not in TCP_LISTEN state is handed to
 * __inet_hash_nolisten().  A listening sock is added at the head of its
 * listening_hash bucket, under the bucket lock, and the protocol's in-use
 * counter is incremented.
 */
static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *tables = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *bucket;

	if (sk->sk_state == TCP_LISTEN) {
		WARN_ON(!sk_unhashed(sk));

		/* Select the listening bucket via inet_sk_listen_hashfn(). */
		bucket = &tables->listening_hash[inet_sk_listen_hashfn(sk)];

		spin_lock(&bucket->lock);
		/* Link the sock at the head of the bucket's list. */
		__sk_nulls_add_node_rcu(sk, &bucket->head);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		spin_unlock(&bucket->lock);
	} else {
		/* Not a listener: use the non-listening hash path. */
		__inet_hash_nolisten(sk, NULL);
	}
}
(1)listen初始化了半连接队列和全连接队列
(2)实现侦听,使TCP传输控制块的状态迁移到LISTEN状态,然后将传输控制块添加到侦听散列表中