Chinaunix首页 | 论坛 | 博客
  • 博客访问: 3926732
  • 博文数量: 93
  • 博客积分: 3189
  • 博客等级: 中校
  • 技术积分: 4229
  • 用 户 组: 普通用户
  • 注册时间: 2009-02-02 13:29
个人简介

出没于杭州和青岛的程序猿一枚,对内核略懂一二

文章分类

全部博文(93)

文章存档

2016年(2)

2015年(3)

2014年(11)

2013年(29)

2012年(16)

2011年(5)

2010年(5)

2009年(22)

分类: LINUX

2013-05-03 15:14:39

版权所有,转载请注明出处。
Author: Tony

今天一个可爱的同事问我同一个socket能否bind多个port?说实话,真没这么玩过。本着严谨的态度,决定研究一下这样是否可以。
说明:本文针对的内核版本是SLES 11 SP2(3.0.13-0.27)。

首先分析系统调用函数bind,内核层实现在net/socket.c中:

点击(此处)折叠或打开

  1. SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
  2. {
  3.     struct socket *sock;
  4.     struct sockaddr_storage address;
  5.     int err, fput_needed;

  6.     sock = sockfd_lookup_light(fd, &err, &fput_needed);
  7.     if (sock) {
  8.         err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
  9.         if (err >= 0) {
  10.             err = security_socket_bind(sock,
  11.                          (struct sockaddr *)&address,
  12.                          addrlen);
  13.             if (!err)
  14.             /* Call the bind of the socket's proto_ops (TCP is the example used here).
  15.               proto_ops bridges the protocol-independent socket layer to the protocol-specific transport layer.
  16.             */
  17.                 err = sock->ops->bind(sock,
  18.                          (struct sockaddr *)
  19.                          &address, addrlen);
  20.         }
  21.         fput_light(sock->file, fput_needed);
  22.     }
  23.     return err;
  24. }
在net/ipv4/af_inet.c中,inetsw_array数组(元素类型为struct inet_protosw)为tcp指定了传输层对应的proto_ops

点击(此处)折叠或打开

  1. static struct inet_protosw inetsw_array[] =
  2. {
  3.     {
  4.         .type = SOCK_STREAM,
  5.         .protocol = IPPROTO_TCP,
  6.         .prot = &tcp_prot,
  7.         /* proto_ops used for the TCP transport protocol */
  8.         .ops = &inet_stream_ops,
  9.         .no_check = 0,
  10.         .flags = INET_PROTOSW_PERMANENT |
  11.              INET_PROTOSW_ICSK,
  12.     },

  13.     {
  14.         .type = SOCK_DGRAM,
  15.         .protocol = IPPROTO_UDP,
  16.         .prot = &udp_prot,
  17.         .ops = &inet_dgram_ops,
  18.         .no_check = UDP_CSUM_DEFAULT,
  19.         .flags = INET_PROTOSW_PERMANENT,
  20.        },

  21.        {
  22.         .type = SOCK_DGRAM,
  23.         .protocol = IPPROTO_ICMP,
  24.         .prot = &ping_prot,
  25.         .ops = &inet_dgram_ops,
  26.         .no_check = UDP_CSUM_DEFAULT,
  27.         .flags = INET_PROTOSW_REUSE,
  28.        },

  29.        {
  30.      .type = SOCK_RAW,
  31.      .protocol = IPPROTO_IP,    /* wild card */
  32.      .prot = &raw_prot,
  33.      .ops = &inet_sockraw_ops,
  34.      .no_check = UDP_CSUM_DEFAULT,
  35.      .flags = INET_PROTOSW_REUSE,
  36.        }
  37. };
inet_stream_ops的定义也在net/ipv4/af_inet.c中

点击(此处)折叠或打开

  1. const struct proto_ops inet_stream_ops = {
  2.     .family         = PF_INET,
  3.     .owner         = THIS_MODULE,
  4.     .release     = inet_release,
  5.     /* The bind operation of TCP's proto_ops is set to inet_bind here. */
  6.     .bind         = inet_bind,
  7.     .connect     = inet_stream_connect,
  8.     .socketpair     = sock_no_socketpair,
  9.     .accept         = inet_accept,
  10.     .getname     = inet_getname,
  11.     .poll         = tcp_poll,
  12.     .ioctl         = inet_ioctl,
  13.     .listen         = inet_listen,
  14.     .shutdown     = inet_shutdown,
  15.     .setsockopt     = sock_common_setsockopt,
  16.     .getsockopt     = sock_common_getsockopt,
  17.     .sendmsg     = inet_sendmsg,
  18.     .recvmsg     = inet_recvmsg,
  19.     .mmap         = sock_no_mmap,
  20.     .sendpage     = inet_sendpage,
  21.     .splice_read     = tcp_splice_read,
  22. #ifdef CONFIG_COMPAT
  23.     .compat_setsockopt = compat_sock_common_setsockopt,
  24.     .compat_getsockopt = compat_sock_common_getsockopt,
  25.     .compat_ioctl     = inet_compat_ioctl,
  26. #endif
  27. };
inet_bind的实现也在文件net/ipv4/af_inet.c中

点击(此处)折叠或打开

  1. int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  2. {
  3.     struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
  4.     struct sock *sk = sock->sk;
  5.     struct inet_sock *inet = inet_sk(sk);
  6.     unsigned short snum;
  7.     int chk_addr_ret;
  8.     int err;

  9.     /* If the socket has its own bind function then use it. (RAW) */
  10.     if (sk->sk_prot->bind) {
  11.         err = sk->sk_prot->bind(sk, uaddr, addr_len);
  12.         goto out;
  13.     }
  14.     err = -EINVAL;
  15.     if (addr_len < sizeof(struct sockaddr_in))
  16.         goto out;

  17.     if (addr->sin_family != AF_INET) {
  18.         /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
  19.          * only if s_addr is INADDR_ANY.
  20.          */
  21.         err = -EAFNOSUPPORT;
  22.         if (addr->sin_family != AF_UNSPEC ||
  23.          addr->sin_addr.s_addr != htonl(INADDR_ANY))
  24.             goto out;
  25.     }

  26.     chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);

  27.     /* Not specified by any standard per-se, however it breaks too
  28.      * many applications when removed. It is unfortunate since
  29.      * allowing applications to make a non-local bind solves
  30.      * several problems with systems using dynamic addressing.
  31.      * (ie. your servers still start up even if your ISDN link
  32.      * is temporarily down)
  33.      */
  34.     err = -EADDRNOTAVAIL;
  35.     if (!sysctl_ip_nonlocal_bind &&
  36.      !(inet->freebind || inet->transparent) &&
  37.      addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
  38.      chk_addr_ret != RTN_LOCAL &&
  39.      chk_addr_ret != RTN_MULTICAST &&
  40.      chk_addr_ret != RTN_BROADCAST)
  41.         goto out;

  42.     snum = ntohs(addr->sin_port);
  43.     err = -EACCES;
  44.     if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
  45.         goto out;

  46.     /* We keep a pair of addresses. rcv_saddr is the one
  47.      * used by hash lookups, and saddr is used for transmit.
  48.      *
  49.      * In the BSD API these are the same except where it
  50.      * would be illegal to use them (multicast/broadcast) in
  51.      * which case the sending device address is used.
  52.      */
  53.     lock_sock(sk);

  54.     /* Check these errors (active socket, double bind). */
  55.     err = -EINVAL;
  56.     /* This verifies that the sock being bound is in the TCP_CLOSE state (unused)
  57.       and that it has not already been bound to a port.
  58.       On the sock's first bind inet->inet_num is 0; once a bind has succeeded
  59.       inet_num is non-zero, so a socket can be bound to at most one port, and
  60.       an attempt to bind it again fails with error 22 (EINVAL).
  61.       Next question: where does inet_num get assigned?
  62.     */
  63.     if (sk->sk_state != TCP_CLOSE || inet->inet_num)
  64.         goto out_release_sock;

  65.     inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
  66.     if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
  67.         inet->inet_saddr = 0; /* Use device */

  68.     /* Make sure we are allowed to bind here. */
  69.     /* get_port selects a usable port for the bind based on the argument passed in.
  70.        sk->sk_prot holds the operations of the sock control block; struct proto
  71.        provides the transition from the transport layer to the network layer.
  72.        For a TCP sock the get_port function is inet_csk_get_port;
  73.        tcp_prot (a struct proto) is defined in net/ipv4/tcp_ipv4.c.
  74.     */
  75.     if (sk->sk_prot->get_port(sk, snum)) {
  76.         inet->inet_saddr = inet->inet_rcv_saddr = 0;
  77.         err = -EADDRINUSE;
  78.         goto out_release_sock;
  79.     }

  80.     if (inet->inet_rcv_saddr)
  81.         sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
  82.     if (snum)
  83.         sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
  84.     inet->inet_sport = htons(inet->inet_num);
  85.     inet->inet_daddr = 0;
  86.     inet->inet_dport = 0;
  87.     sk_dst_reset(sk);
  88.     err = 0;
  89. out_release_sock:
  90.     release_sock(sk);
  91. out:
  92.     return err;
  93. }
inet_csk_get_port的实现在net/ipv4/inet_connection_sock.c中


点击(此处)折叠或打开

  1. /* Obtain a reference to a local port for the given sock,
  2.  * if snum is zero it means select any available local port.
  3.  */
  4. int inet_csk_get_port(struct sock *sk, unsigned short snum)
  5. {
  6.     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
  7.     struct inet_bind_hashbucket *head;
  8.     struct hlist_node *node;
  9.     struct inet_bind_bucket *tb;
  10.     int ret, attempts = 5;
  11.     struct net *net = sock_net(sk);
  12.     int smallest_size = -1, smallest_rover;

  13.     local_bh_disable();
  14.     /* As can be seen here, when the port passed in is 0 the kernel picks an available port automatically. */
  15.     if (!snum) {
  16.         int remaining, rover, low, high;

  17. again:
  18.         inet_get_local_port_range(&low, &high);
  19.         remaining = (high - low) + 1;
  20.         smallest_rover = rover = net_random() % remaining + low;

  21.         smallest_size = -1;
  22.         do {
  23.             if (inet_is_reserved_local_port(rover))
  24.                 goto next_nolock;
  25.             head = &hashinfo->bhash[inet_bhashfn(net, rover,
  26.                     hashinfo->bhash_size)];
  27.             spin_lock(&head->lock);
  28.             inet_bind_bucket_for_each(tb, node, &head->chain)
  29.                 if (net_eq(ib_net(tb), net) && tb->port == rover) {
  30.                     if (tb->fastreuse > 0 &&
  31.                      sk->sk_reuse &&
  32.                      sk->sk_state != TCP_LISTEN &&
  33.                      (tb->num_owners < smallest_size || smallest_size == -1)) {
  34.                         smallest_size = tb->num_owners;
  35.                         smallest_rover = rover;
  36.                         if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
  37.                             spin_unlock(&head->lock);
  38.                             snum = smallest_rover;
  39.                             goto have_snum;
  40.                         }
  41.                     }
  42.                     goto next;
  43.                 }
  44.             break;
  45.         next:
  46.             spin_unlock(&head->lock);
  47.         next_nolock:
  48.             if (++rover > high)
  49.                 rover = low;
  50.         } while (--remaining > 0);

  51.         /* Exhausted local port range during search? It is not
  52.          * possible for us to be holding one of the bind hash
  53.          * locks if this test triggers, because if 'remaining'
  54.          * drops to zero, we broke out of the do/while loop at
  55.          * the top level, not from the 'break;' statement.
  56.          */
  57.         ret = 1;
  58.         if (remaining <= 0) {
  59.             if (smallest_size != -1) {
  60.                 snum = smallest_rover;
  61.                 goto have_snum;
  62.             }
  63.             goto fail;
  64.         }
  65.         /* OK, here is the one we will use. HEAD is
  66.          * non-NULL and we hold it's mutex.
  67.          */
  68.         snum = rover;
  69.     } else {
  70. have_snum:
  71.         head = &hashinfo->bhash[inet_bhashfn(net, snum,
  72.                 hashinfo->bhash_size)];
  73.         spin_lock(&head->lock);
  74.         inet_bind_bucket_for_each(tb, node, &head->chain)
  75.             if (net_eq(ib_net(tb), net) && tb->port == snum)
  76.                 goto tb_found;
  77.     }
  78.     tb = NULL;
  79.     goto tb_not_found;
  80. tb_found:
  81.     if (!hlist_empty(&tb->owners)) {
  82.         if (tb->fastreuse > 0 &&
  83.          sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
  84.          smallest_size == -1) {
  85.             goto success;
  86.         } else {
  87.             ret = 1;
  88.             if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
  89.                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
  90.                  smallest_size != -1 && --attempts >= 0) {
  91.                     spin_unlock(&head->lock);
  92.                     goto again;
  93.                 }
  94.                 goto fail_unlock;
  95.             }
  96.         }
  97.     }
  98. tb_not_found:
  99.     ret = 1;
  100.     if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
  101.                     net, head, snum)) == NULL)
  102.         goto fail_unlock;
  103.     if (hlist_empty(&tb->owners)) {
  104.         if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
  105.             tb->fastreuse = 1;
  106.         else
  107.             tb->fastreuse = 0;
  108.     } else if (tb->fastreuse &&
  109.          (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
  110.         tb->fastreuse = 0;
  111. success:
  112.     if (!inet_csk(sk)->icsk_bind_hash)
  113.         /* Reaching this point means the port is usable, so this sk must be linked
  114.           into the port's owners list; inet_bind_hash is the function that does it.
  115.         */
  116.         inet_bind_hash(sk, tb, snum);
  117.     WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
  118.     ret = 0;

  119. fail_unlock:
  120.     spin_unlock(&head->lock);
  121. fail:
  122.     local_bh_enable();
  123.     return ret;
  124. }
inet_bind_hash函数在net/ipv4/inet_hashtables.c中实现

点击(此处)折叠或打开

  1. void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
  2.          const unsigned short snum)
  3. {
  4.     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

  5.     atomic_inc(&hashinfo->bsockets);
  6.     /*
  7.         This is where the sock's inet_num is set to the selected port.
  8.     */
  9.     inet_sk(sk)->inet_num = snum;
  10.     sk_add_bind_node(sk, &tb->owners);
  11.     tb->num_owners++;
  12.     inet_csk(sk)->icsk_bind_hash = tb;
  13. }

综上分析,可以看出同一个socket最多只能bind一个端口:第一次bind成功后inet->inet_num被置为非零,此后对同一个socket再次调用bind,inet_bind会直接返回错误22(EINVAL)。
阅读(16663) | 评论(0) | 转发(6) |
给主人留下些什么吧!~~