版权所有,转载请注明出处。
Author: Tony
今天一个可爱的同事问我同一个socket能否bind多个port?说实话,真没这么玩过。本着严谨的态度,决定研究一下这样是否可以。
说明:本文分析所针对的内核版本是 SLES 11 SP2 3.0.13-0.27。
首先分析系统调用函数bind,内核层实现在net/socket.c中:
-
/*
 * bind(2) system call entry point (net/socket.c).
 *
 * Looks up the socket behind @fd, copies the user-supplied address into a
 * kernel-side sockaddr_storage, runs the security hook and, only if the
 * hook returns 0, dispatches to the bind handler in the socket's proto_ops.
 *
 * @fd:      file descriptor passed to sockfd_lookup_light()
 * @umyaddr: user-space pointer to the address to bind
 * @addrlen: length of the address at @umyaddr
 *
 * Returns err: the value left by the first step that fails, otherwise the
 * result of sock->ops->bind(). When the lookup yields no socket, err is
 * whatever sockfd_lookup_light() stored through &err.
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
		if (err >= 0) {
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			/*
			 * Call the bind handler of this socket's proto_ops
			 * (this walkthrough uses TCP as the example).
			 * proto_ops is the bridge from the protocol-independent
			 * socket layer to the protocol-specific transport layer.
			 */
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      &address, addrlen);
		}
		/* Balances the sockfd_lookup_light() above. */
		fput_light(sock->file, fput_needed);
	}
	return err;
}
在net/ipv4/af_inet.c中,inetsw_array(struct inet_protosw数组)为tcp传输层指定了对应的proto_ops:
-
static struct inet_protosw inetsw_array[] =
-
{
-
{
-
.type = SOCK_STREAM,
-
.protocol = IPPROTO_TCP,
-
.prot = &tcp_prot,
-
/*这里指定了tcp传输层协议的proto_ops*/
-
.ops = &inet_stream_ops,
-
.no_check = 0,
-
.flags = INET_PROTOSW_PERMANENT |
-
INET_PROTOSW_ICSK,
-
},
-
-
{
-
.type = SOCK_DGRAM,
-
.protocol = IPPROTO_UDP,
-
.prot = &udp_prot,
-
.ops = &inet_dgram_ops,
-
.no_check = UDP_CSUM_DEFAULT,
-
.flags = INET_PROTOSW_PERMANENT,
-
},
-
-
{
-
.type = SOCK_DGRAM,
-
.protocol = IPPROTO_ICMP,
-
.prot = &ping_prot,
-
.ops = &inet_dgram_ops,
-
.no_check = UDP_CSUM_DEFAULT,
-
.flags = INET_PROTOSW_REUSE,
-
},
-
-
{
-
.type = SOCK_RAW,
-
.protocol = IPPROTO_IP, /* wild card */
-
.prot = &raw_prot,
-
.ops = &inet_sockraw_ops,
-
.no_check = UDP_CSUM_DEFAULT,
-
.flags = INET_PROTOSW_REUSE,
-
}
-
};
inet_stream_ops的定义也在net/ipv4/af_inet.c中
-
const struct proto_ops inet_stream_ops = {
-
.family = PF_INET,
-
.owner = THIS_MODULE,
-
.release = inet_release,
-
/*这里指定了tcp协议proto_ops的bind函数为inet_bind*/
-
.bind = inet_bind,
-
.connect = inet_stream_connect,
-
.socketpair = sock_no_socketpair,
-
.accept = inet_accept,
-
.getname = inet_getname,
-
.poll = tcp_poll,
-
.ioctl = inet_ioctl,
-
.listen = inet_listen,
-
.shutdown = inet_shutdown,
-
.setsockopt = sock_common_setsockopt,
-
.getsockopt = sock_common_getsockopt,
-
.sendmsg = inet_sendmsg,
-
.recvmsg = inet_recvmsg,
-
.mmap = sock_no_mmap,
-
.sendpage = inet_sendpage,
-
.splice_read = tcp_splice_read,
-
#ifdef CONFIG_COMPAT
-
.compat_setsockopt = compat_sock_common_setsockopt,
-
.compat_getsockopt = compat_sock_common_getsockopt,
-
.compat_ioctl = inet_compat_ioctl,
-
#endif
-
};
inet_bind的实现也在文件net/ipv4/af_inet.c中
-
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-
{
-
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
-
struct sock *sk = sock->sk;
-
struct inet_sock *inet = inet_sk(sk);
-
unsigned short snum;
-
int chk_addr_ret;
-
int err;
-
-
/* If the socket has its own bind function then use it. (RAW) */
-
if (sk->sk_prot->bind) {
-
err = sk->sk_prot->bind(sk, uaddr, addr_len);
-
goto out;
-
}
-
err = -EINVAL;
-
if (addr_len < sizeof(struct sockaddr_in))
-
goto out;
-
-
if (addr->sin_family != AF_INET) {
-
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
-
* only if s_addr is INADDR_ANY.
-
*/
-
err = -EAFNOSUPPORT;
-
if (addr->sin_family != AF_UNSPEC ||
-
addr->sin_addr.s_addr != htonl(INADDR_ANY))
-
goto out;
-
}
-
-
chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
-
-
/* Not specified by any standard per-se, however it breaks too
-
* many applications when removed. It is unfortunate since
-
* allowing applications to make a non-local bind solves
-
* several problems with systems using dynamic addressing.
-
* (ie. your servers still start up even if your ISDN link
-
* is temporarily down)
-
*/
-
err = -EADDRNOTAVAIL;
-
if (!sysctl_ip_nonlocal_bind &&
-
!(inet->freebind || inet->transparent) &&
-
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
-
chk_addr_ret != RTN_LOCAL &&
-
chk_addr_ret != RTN_MULTICAST &&
-
chk_addr_ret != RTN_BROADCAST)
-
goto out;
-
-
snum = ntohs(addr->sin_port);
-
err = -EACCES;
-
if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
-
goto out;
-
-
/* We keep a pair of addresses. rcv_saddr is the one
-
* used by hash lookups, and saddr is used for transmit.
-
*
-
* In the BSD API these are the same except where it
-
* would be illegal to use them (multicast/broadcast) in
-
* which case the sending device address is used.
-
*/
-
lock_sock(sk);
-
-
/* Check these errors (active socket, double bind). */
-
err = -EINVAL;
-
/*这里会校验要bind的sock是不是在TCP_CLOSE状态(未使用),以及当前
-
sock是否已经bind相应的端口。
-
当该sock第一次bind的时候,inet->inet_num肯定是0,当bind一次后,
-
这个inet_num就变为了非零,从而一个socket最多bind一个端口,如果
-
尝试bind多个端口将会返回错误22(EINVAL)
-
下面我们看一下,inet_num是在什么时候被复制的?
-
*/
-
if (sk->sk_state != TCP_CLOSE || inet->inet_num)
-
goto out_release_sock;
-
-
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
-
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-
inet->inet_saddr = 0; /* Use device */
-
-
/* Make sure we are allowed to bind here. */
-
/* 这个函数会根据传入的参数,选择一个可用的端口进行bind,
-
sk->sk_prot就是网络控制块sock的操作函数,struct proto
-
实现了传输层到网络层的转换。
-
tcp sock的get_port函数是inet_csk_get_port函数
-
tcp_prot(struct proto)在net/ipv4/tcp_ipv4.c中定义
-
*/
-
if (sk->sk_prot->get_port(sk, snum)) {
-
inet->inet_saddr = inet->inet_rcv_saddr = 0;
-
err = -EADDRINUSE;
-
goto out_release_sock;
-
}
-
-
if (inet->inet_rcv_saddr)
-
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
-
if (snum)
-
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-
inet->inet_sport = htons(inet->inet_num);
-
inet->inet_daddr = 0;
-
inet->inet_dport = 0;
-
sk_dst_reset(sk);
-
err = 0;
-
out_release_sock:
-
release_sock(sk);
-
out:
-
return err;
-
}
inet_csk_get_port的实现在net/ipv4/inet_connection_sock.c中
-
/* Obtain a reference to a local port for the given sock,
-
* if snum is zero it means select any available local port.
-
*/
-
int inet_csk_get_port(struct sock *sk, unsigned short snum)
-
{
-
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
struct inet_bind_hashbucket *head;
-
struct hlist_node *node;
-
struct inet_bind_bucket *tb;
-
int ret, attempts = 5;
-
struct net *net = sock_net(sk);
-
int smallest_size = -1, smallest_rover;
-
-
local_bh_disable();
-
/*从这里可用看的,如果传入的端口为0,那么内核会自动选择一个可用的端口*/
-
if (!snum) {
-
int remaining, rover, low, high;
-
-
again:
-
inet_get_local_port_range(&low, &high);
-
remaining = (high - low) + 1;
-
smallest_rover = rover = net_random() % remaining + low;
-
-
smallest_size = -1;
-
do {
-
if (inet_is_reserved_local_port(rover))
-
goto next_nolock;
-
head = &hashinfo->bhash[inet_bhashfn(net, rover,
-
hashinfo->bhash_size)];
-
spin_lock(&head->lock);
-
inet_bind_bucket_for_each(tb, node, &head->chain)
-
if (net_eq(ib_net(tb), net) && tb->port == rover) {
-
if (tb->fastreuse > 0 &&
-
sk->sk_reuse &&
-
sk->sk_state != TCP_LISTEN &&
-
(tb->num_owners < smallest_size || smallest_size == -1)) {
-
smallest_size = tb->num_owners;
-
smallest_rover = rover;
-
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
-
spin_unlock(&head->lock);
-
snum = smallest_rover;
-
goto have_snum;
-
}
-
}
-
goto next;
-
}
-
break;
-
next:
-
spin_unlock(&head->lock);
-
next_nolock:
-
if (++rover > high)
-
rover = low;
-
} while (--remaining > 0);
-
-
/* Exhausted local port range during search? It is not
-
* possible for us to be holding one of the bind hash
-
* locks if this test triggers, because if 'remaining'
-
* drops to zero, we broke out of the do/while loop at
-
* the top level, not from the 'break;' statement.
-
*/
-
ret = 1;
-
if (remaining <= 0) {
-
if (smallest_size != -1) {
-
snum = smallest_rover;
-
goto have_snum;
-
}
-
goto fail;
-
}
-
/* OK, here is the one we will use. HEAD is
-
* non-NULL and we hold it's mutex.
-
*/
-
snum = rover;
-
} else {
-
have_snum:
-
head = &hashinfo->bhash[inet_bhashfn(net, snum,
-
hashinfo->bhash_size)];
-
spin_lock(&head->lock);
-
inet_bind_bucket_for_each(tb, node, &head->chain)
-
if (net_eq(ib_net(tb), net) && tb->port == snum)
-
goto tb_found;
-
}
-
tb = NULL;
-
goto tb_not_found;
-
tb_found:
-
if (!hlist_empty(&tb->owners)) {
-
if (tb->fastreuse > 0 &&
-
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
-
smallest_size == -1) {
-
goto success;
-
} else {
-
ret = 1;
-
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
-
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
-
smallest_size != -1 && --attempts >= 0) {
-
spin_unlock(&head->lock);
-
goto again;
-
}
-
goto fail_unlock;
-
}
-
}
-
}
-
tb_not_found:
-
ret = 1;
-
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-
net, head, snum)) == NULL)
-
goto fail_unlock;
-
if (hlist_empty(&tb->owners)) {
-
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-
tb->fastreuse = 1;
-
else
-
tb->fastreuse = 0;
-
} else if (tb->fastreuse &&
-
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-
tb->fastreuse = 0;
-
success:
-
if (!inet_csk(sk)->icsk_bind_hash)
-
/*到这里说明端口是可用的了,需要将该sk挂接到该端口的owners链表中
-
我们看一下这个函数的实现。
-
*/
-
inet_bind_hash(sk, tb, snum);
-
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
-
ret = 0;
-
-
fail_unlock:
-
spin_unlock(&head->lock);
-
fail:
-
local_bh_enable();
-
return ret;
-
}
inet_bind_hash函数在net/ipv4/inet_hashtables.c中实现
-
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
-
const unsigned short snum)
-
{
-
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
-
atomic_inc(&hashinfo->bsockets);
-
/*
-
在这里将sock的inet_num置为选中的端口。
-
*/
-
inet_sk(sk)->inet_num = snum;
-
sk_add_bind_node(sk, &tb->owners);
-
tb->num_owners++;
-
inet_csk(sk)->icsk_bind_hash = tb;
-
}
综上分析,可以看出同一个socket最多只能bind一个端口;如果对已经bind过端口的socket再次调用bind,会返回错误22(EINVAL)。
阅读(1129) | 评论(0) | 转发(0) |