Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1321493
  • 博文数量: 107
  • 博客积分: 10155
  • 博客等级: 上将
  • 技术积分: 2166
  • 用 户 组: 普通用户
  • 注册时间: 2008-03-25 16:57
文章分类

全部博文(107)

文章存档

2010年(1)

2009年(1)

2008年(105)

分类: LINUX

2008-11-05 10:34:32

今天真是不幸,写好的文章竟然在编辑器里被错误冲刷掉了,本来已经完成了本章节的内容,却在最后关头出现了一个IE错误,现在是重写的,请朋友们见谅。我是无名小卒,请转载的朋友注明出处,谢谢。昨天我们看到了inet_create()函数,我们今天继续。

/*
 * inet_create - set up the protocol-level sock behind a new PF_INET socket.
 * @net:      network namespace, handed to inet_netns_ok() and sk_alloc()
 * @sock:     generic socket being created; sock->type selects the inetsw list
 * @protocol: requested protocol number (IPPROTO_IP is treated as a wildcard)
 *
 * Returns 0 on success or a negative errno value.
 * (The article quotes this function in two pieces; this is the prologue.)
 */
static int inet_create(struct net *net, struct socket *sock, int protocol)
{
    struct sock *sk;
    struct list_head *p;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    /*
     * For any socket type other than SOCK_RAW and SOCK_DGRAM, generate
     * the ehash secret on first use (it is still zero at that point).
     */
    if (sock->type != SOCK_RAW &&
     sock->type != SOCK_DGRAM &&
     !inet_ehash_secret)
        build_ehash_secret();

    sock->state = SS_UNCONNECTED;

    /* Look for the requested type/protocol pair. */
    answer = NULL;

这个函数首先检查要创建的socket既不是原始socket(SOCK_RAW)也不是UDP socket(SOCK_DGRAM),并且判断是否已经有了加密字符(inet_ehash_secret),如果还没有就会调用build_ehash_secret()来生成一个

/*
 * build_ehash_secret - give inet_ehash_secret a non-zero random value.
 *
 * Zero is what callers test for to mean "not yet set", so random values
 * are drawn until a non-zero one appears. The store is re-checked under
 * inetsw_lock so that a secret which is already set is never overwritten.
 */
void build_ehash_secret(void)
{
    u32 rnd;
    /* Keep drawing until the candidate is non-zero. */
    do {
        get_random_bytes(&rnd, sizeof(rnd));
    } while (rnd == 0);
    spin_lock_bh(&inetsw_lock);
    /* Only install the candidate if no secret has been stored yet. */
    if (!inet_ehash_secret)
        inet_ehash_secret = rnd;
    spin_unlock_bh(&inetsw_lock);
}

get_random_bytes是取得一个随机数即“熵”,取得后赋值给加密字符使用。然后上面的函数中将socket设置为未连接状态。我们继续往下看

lookup_protocol:
    /*
     * Scan the RCU-protected inetsw list for sock->type. err stays
     * -ESOCKTNOSUPPORT if no entry is examined and becomes
     * -EPROTONOSUPPORT once an entry has been examined and rejected.
     */
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    list_for_each_rcu(p, &inetsw[sock->type]) {
        answer = list_entry(p, struct inet_protosw, list);

        /* Check the non-wild match. */
        if (protocol == answer->protocol) {
            /* Exact match on a real protocol number: accept it. */
            if (protocol != IPPROTO_IP)
                break;
        } else {
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) {
                /* Caller asked for "any": adopt this entry's protocol. */
                protocol = answer->protocol;
                break;
            }
            /* Entry itself is a wildcard: it accepts any protocol. */
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        /* No match on this entry; reset and keep scanning. */
        err = -EPROTONOSUPPORT;
        answer = NULL;
    }

    if (unlikely(answer == NULL)) {
        /* At most two module-load attempts, each followed by a rescan. */
        if (try_loading_module < 2) {
            /* Leave the RCU read section before calling request_module(). */
            rcu_read_unlock();
            /*
             * Be more specific, e.g. net-pf-2-proto-132-type-1
             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
             */

            if (++try_loading_module == 1)
                request_module("net-pf-%d-proto-%d-type-%d",
                     PF_INET, protocol, sock->type);
            /*
             * Fall back to generic, e.g. net-pf-2-proto-132
             * (net-pf-PF_INET-proto-IPPROTO_SCTP)
             */

            else
                request_module("net-pf-%d-proto-%d",
                     PF_INET, protocol);
            goto lookup_protocol;
        } else
            goto out_rcu_unlock;
    }

    /* A capability is only enforced when the entry names one (> 0). */
    err = -EPERM;
    if (answer->capability > 0 && !capable(answer->capability))
        goto out_rcu_unlock;

    err = -EAFNOSUPPORT;
    if (!inet_netns_ok(net, protocol))
        goto out_rcu_unlock;

    /*
     * Copy everything needed out of the matched entry while still
     * inside the RCU read section; answer is not used after the unlock.
     */
    sock->ops = answer->ops;
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    BUG_TRAP(answer_prot->slab != NULL);

    /* Allocate the sock, bound to the protocol found above. */
    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = 1;

    inet = inet_sk(sk);
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    /* Raw sockets record the protocol number in inet->num. */
    if (SOCK_RAW == sock->type) {
        inet->num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (ipv4_config.no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->id = 0;

    /* Tie the generic socket and the new sock together. */
    sock_init_data(sock, sk);

    sk->sk_destruct     = inet_sock_destruct;
    sk->sk_family     = PF_INET;
    sk->sk_protocol     = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl    = -1;
    inet->mc_loop    = 1;
    inet->mc_ttl    = 1;
    inet->mc_index    = 0;
    inet->mc_list    = NULL;

    sk_refcnt_debug_inc(sk);

    if (inet->num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */

        inet->sport = htons(inet->num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    /* Run the protocol's own init hook, if any; release the sock on failure. */
    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    /* Error exits taken while still inside the RCU read section. */
    rcu_read_unlock();
    goto out;
}

这段代码看似复杂其实分析起来并不算难,首先上面

list_for_each_rcu(p, &inetsw[sock->type])是一个宏,我们看一下

/*
 * list_for_each_rcu - iterate over an RCU-protected list.
 * @pos:  struct list_head pointer used as the cursor
 * @head: head of the list
 *
 * Every next pointer is loaded through rcu_dereference(), and pos->next
 * is prefetched before each termination test.
 */
#define list_for_each_rcu(pos, head) \
    for (pos = rcu_dereference((head)->next); \
        prefetch(pos->next), pos != (head); \
        pos = rcu_dereference(pos->next))

这段宏我贴些资料供大家理解,下面这些内容出自

RCU(Read-Copy Update)通过延迟写操作来提高同步性能,具体请参见第3章。这里只分析具有RCU的链表。
RCU常用来保护读操作占多数的链表与数组。具有RCU的链表的操作函数与普通链表操作函数的区别是在函数名后加上了_rcu,如list_for_each_rcu函数。
函数list_for_each_rcu的功能是遍历一个rcu保护的链表。其中,参数pos表示用来做链表位置计数的&struct list_head结构,参数head表示链表头。只要遍历被rcu_read_lock()保护,使用诸如list_add_rcu()的函数对链表同时访问是安全的。
函数list_for_each_rcu列出如下:

/*
 * Variant of list_for_each_rcu as quoted from the reference material.
 * It differs from the version shown earlier in the article: the first
 * element is read with a plain (head)->next load, and only the later
 * advances go through rcu_dereference().
 */
#define list_for_each_rcu(pos, head) \
    for (pos = (head)->next, prefetch(pos->next); pos != (head); \
            pos = rcu_dereference(pos->next), prefetch(pos->next))

函数rcu_dereference在RCU读临界部分中取出一个RCU保护的指针。在需要内存屏障的体系中进行内存屏障(目前只有Alpha体系需要),函数列出如下:

/*
 * rcu_dereference - fetch a pointer for use inside an RCU read section.
 *
 * Copies p into a local temporary, issues smp_read_barrier_depends(),
 * and yields the temporary as the value of the statement expression.
 */
#define rcu_dereference(p) ({ \
                typeof(p) _________p1 = p; \
                smp_read_barrier_depends(); \
                (_________p1); \
                })

在include/asm-i386/system.h中:

/* The SMP form simply expands to read_barrier_depends() here. */
#define smp_read_barrier_depends()    read_barrier_depends()

很明显上面的宏就是循环检查inetsw数组找到符合我们socket类型的链头,那么这个数组是什么时候初始化的呢?我们再像上一节那样看一下

static int __init inet_init(void)
{
。。。。。。
    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
        inet_register_protosw(q);
。。。。。。
}

我们看到他循环用inet_register_protosw()函数处理inetsw_array数组中的元素

/*
 * Built-in protocol switch entries: one each for stream (TCP), datagram
 * (UDP) and raw sockets. inet_create() only calls capable() when
 * .capability is greater than zero, so -1 means no capability is required.
 */
static struct inet_protosw inetsw_array[] =
{
    /* SOCK_STREAM / TCP: permanent, flagged INET_PROTOSW_ICSK. */
    {
        .type = SOCK_STREAM,
        .protocol = IPPROTO_TCP,
        .prot = &tcp_prot,
        .ops = &inet_stream_ops,
        .capability = -1,
        .no_check = 0,
        .flags = INET_PROTOSW_PERMANENT |
             INET_PROTOSW_ICSK,
    },

    /* SOCK_DGRAM / UDP: permanent. */
    {
        .type = SOCK_DGRAM,
        .protocol = IPPROTO_UDP,
        .prot = &udp_prot,
        .ops = &inet_dgram_ops,
        .capability = -1,
        .no_check = UDP_CSUM_DEFAULT,
        .flags = INET_PROTOSW_PERMANENT,
       },


       /* SOCK_RAW: wildcard protocol, requires CAP_NET_RAW, not permanent. */
       {
     .type = SOCK_RAW,
     .protocol = IPPROTO_IP,    /* wild card */
     .prot = &raw_prot,
     .ops = &inet_sockraw_ops,
     .capability = CAP_NET_RAW,
     .no_check = UDP_CSUM_DEFAULT,
     .flags = INET_PROTOSW_REUSE,
       }
};

我们结合应用程序的练习看一下

server_sockfd = socket(AF_INET, SOCK_STREAM, 0);

这里我们看到在上面的数组中第一个元素就是我们需要的,他是被下面的函数登记到数组中的

/*
 * inet_register_protosw - add a protocol switch entry to inetsw[].
 * @p: entry to register; p->type selects the list, p->protocol is the key
 *
 * The entry is rejected (with a KERN_ERR message, no return value) when
 * p->type is >= SOCK_MAX or when a permanent entry with the same protocol
 * is already on the list. Otherwise it is inserted after the last
 * permanent entry seen during the scan. inetsw_lock is held for the scan
 * and the insertion; synchronize_net() is called after the lock is
 * dropped on every path, including the two error paths.
 */
void inet_register_protosw(struct inet_protosw *p)
{
    struct list_head *lh;
    struct inet_protosw *answer;
    int protocol = p->protocol;
    struct list_head *last_perm;

    spin_lock_bh(&inetsw_lock);

    if (p->type >= SOCK_MAX)
        goto out_illegal;

    /* If we are trying to override a permanent protocol, bail. */
    answer = NULL;
    /* Default insertion point: the list head itself. */
    last_perm = &inetsw[p->type];
    list_for_each(lh, &inetsw[p->type]) {
        answer = list_entry(lh, struct inet_protosw, list);

        /* Check only the non-wild match. */
        if (INET_PROTOSW_PERMANENT & answer->flags) {
            /* Leave the loop with answer set: this is a conflict. */
            if (protocol == answer->protocol)
                break;
            /* Remember the most recent permanent entry passed. */
            last_perm = lh;
        }

        answer = NULL;
    }
    /* answer is non-NULL only if the loop broke out on a conflict. */
    if (answer)
        goto out_permanent;

    /* Add the new entry after the last permanent entry if any, so that
     * the new entry does not override a permanent entry when matched with
     * a wild-card protocol. But it is allowed to override any existing
     * non-permanent entry. This means that when we remove this entry, the
     * system automatically returns to the old behavior.
     */

    list_add_rcu(&p->list, last_perm);
out:
    spin_unlock_bh(&inetsw_lock);

    synchronize_net();

    return;

out_permanent:
    printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
     protocol);
    goto out;

out_illegal:
    printk(KERN_ERR
     "Ignoring attempt to register invalid socket type %d.\n",
     p->type);
    goto out;
}

很明显在上面的循环中找到适合的链头位置,将我们的数组中的元素一一注册登记到数组中。我们这里要看一下inet_protosw结构

/*
 * One entry of the inetsw protocol switch. Entries are linked through
 * 'list' onto inetsw[type] and looked up by (type, protocol).
 */
struct inet_protosw {
    struct list_head list;

        /* These two fields form the lookup key. */
    unsigned short     type;     /* This is the 2nd argument to socket(2). */
    unsigned short     protocol; /* This is the L4 protocol number. */

    struct proto     *prot;      /* handed to sk_alloc() by inet_create() */
    const struct proto_ops *ops; /* copied into sock->ops by inet_create() */
  
    int capability; /* Which (if any) capability do
                 * we need to use this socket
                 * interface?
                                      */

    char no_check; /* checksum on rcv/xmit/none? */
    unsigned char     flags; /* See INET_PROTOSW_* below. */
};

这个结构是专门用于采用IP协议的socket使用。其内部的变量我们暂时不做分析,也不转译了,我们还是坚持“用时学习”的观念。回到inet_create()函数中,我们已经在inetsw数组中找到了我们的协议类型的链头就会取得其宿主inet_protosw 结构,然后answer = list_entry(p, struct inet_protosw, list);接着函数中对其兼容性进行了检测,最关键的地方是

    sock->ops = answer->ops;
    answer_prot = answer->prot;

这二句首先是为socket的协议操作函数进行了挂钩,我们就要看上面的

    {
        .type = SOCK_STREAM,
        .protocol = IPPROTO_TCP,
        .prot = &tcp_prot,
        .ops = &inet_stream_ops,
        .capability = -1,
        .no_check = 0,
        .flags = INET_PROTOSW_PERMANENT |
             INET_PROTOSW_ICSK,
    },

结合这个元素的设置我们明白了,上面socket的协议操作函数表被设置成了inet_stream_ops,而answer_prot设置成了tcp_prot结构。这是个struct proto结构,我们不看了,其内容很多,但是这个结构的作用得强调一下,它是专门用于socket的传输层使用的结构,而用于网络层向传输层递交数据的结构由另一个结构体struct net_protocol来表示。接着函数中分配了一个sock结构。我是无名小卒,尽管3月份才写博客其实研究内核很多年了,写这些博客是为了与朋友们共享知识发扬copyleft精神,所以请转载的朋友注明出处。分配函数我们曾经在unix socket创建过程中谈到过,我们不细细研究这个函数了,不过要注意其内部的关键地方sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);我们看到它传递了一个answer_prot即我们说的用于socket传输层的钩子函数结构给sk_alloc

/*
 * sk_alloc - allocate a sock and bind it to a protocol.
 * @net:      network namespace; a reference is taken with get_net()
 * @family:   stored in sk->sk_family and passed to sk_prot_alloc()
 * @priority: allocation flags; __GFP_ZERO is OR-ed in by this function
 * @prot:     protocol descriptor stored in sk_prot and sk_prot_creator
 *
 * Returns whatever sk_prot_alloc() returned; the fields are only
 * initialised when that result is non-NULL.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
         struct proto *prot)
{
    struct sock *sk;

    sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
    if (sk) {
        sk->sk_family = family;
        /*
         * See comment in struct sock definition to understand
         * why we need sk_prot_creator -acme
         */

        sk->sk_prot = sk->sk_prot_creator = prot;
        sock_lock_init(sk);
        sock_net_set(sk, get_net(net));
    }

    return sk;
}

注意上面sk->sk_prot = sk->sk_prot_creator = prot;将传输层的钩子函数挂入到了sock中了。再回到inet_create函数中,接下来的代码是一些对sock的初始化,注意sk是sock结构,而sock是socket结构。使用inet = inet_sk(sk);将sk转换为struct inet_sock *inet结构变量,我们在unix socket中曾经说过用于unix socket的unix_sock,而这里用于INET的socket结构是inet_sock。然后函数通过sock_init_data(sock, sk);对sk进行进一步的初始化操作,并让sock与sk挂起钩来,这个函数我们已经在那些unix socket创建文章中分析过了,请不明白的朋友们看那里的文章学习。下面函数中最关键的地方

    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);

这个就是调用了我们上面提到的传输层结构中的钩子函数init.我们看到上面是tcp_prot结构变量。所以进入其init函数中

/*
 * Protocol descriptor for TCP. This is the .prot of the SOCK_STREAM
 * entry in inetsw_array, so inet_create() ends up calling its .init
 * hook through sk->sk_prot->init(sk).
 */
struct proto tcp_prot = {
    .name            = "TCP",
    .owner            = THIS_MODULE,
    .close            = tcp_close,
    .connect        = tcp_v4_connect,
    .disconnect        = tcp_disconnect,
    .accept            = inet_csk_accept,
    .ioctl            = tcp_ioctl,
    /* Per-socket initialisation hook invoked from inet_create(). */
    .init            = tcp_v4_init_sock,
    .destroy        = tcp_v4_destroy_sock,
    .shutdown        = tcp_shutdown,
    .setsockopt        = tcp_setsockopt,
    .getsockopt        = tcp_getsockopt,
    .recvmsg        = tcp_recvmsg,
    .backlog_rcv        = tcp_v4_do_rcv,
    .hash            = inet_hash,
    .unhash            = inet_unhash,
    .get_port        = inet_csk_get_port,
    .enter_memory_pressure    = tcp_enter_memory_pressure,
    .sockets_allocated    = &tcp_sockets_allocated,
    .orphan_count        = &tcp_orphan_count,
    .memory_allocated    = &tcp_memory_allocated,
    .memory_pressure    = &tcp_memory_pressure,
    .sysctl_mem        = sysctl_tcp_mem,
    .sysctl_wmem        = sysctl_tcp_wmem,
    .sysctl_rmem        = sysctl_tcp_rmem,
    .max_header        = MAX_TCP_HEADER,
    /* Object size is that of struct tcp_sock, not struct sock. */
    .obj_size        = sizeof(struct tcp_sock),
    .twsk_prot        = &tcp_timewait_sock_ops,
    .rsk_prot        = &tcp_request_sock_ops,
    .h.hashinfo        = &tcp_hashinfo,
#ifdef CONFIG_COMPAT
    .compat_setsockopt    = compat_tcp_setsockopt,
    .compat_getsockopt    = compat_tcp_getsockopt,
#endif
};

我是无名小卒,转载请注明出处,不要担心结构有多么大,很多朋友在看代码时总是被结构所吓倒,要知道一个好的结构体也不是一天所成更不是一人所完成的,所以我们要针对场景来分析记忆。其实上面我们就只关心一个地方.init = tcp_v4_init_sock,,进入钩子函数

/*
 * tcp_v4_init_sock - TCP/IPv4 per-socket initialisation (tcp_prot.init).
 * @sk: the newly allocated sock
 *
 * Sets up queues and timers, initial timeout and congestion values, the
 * address-family operations and the send/receive buffer sizes, then
 * increments tcp_sockets_allocated. Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
    /* Two typed views of the same sk. */
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    skb_queue_head_init(&tp->out_of_order_queue);
    tcp_init_xmit_timers(sk);
    tcp_prequeue_init(tp);

    icsk->icsk_rto = TCP_TIMEOUT_INIT;
    tp->mdev = TCP_TIMEOUT_INIT;

    /* So many TCP implementations out there (incorrectly) count the
     * initial SYN frame in their delayed-ACK and congestion control
     * algorithms that we must have the following bandaid to talk
     * efficiently to them. -DaveM
     */

    tp->snd_cwnd = 2;

    /* See draft-stevens-tcpca-spec-01 for discussion of the
     * initialization of these values.
     */

    tp->snd_ssthresh = 0x7fffffff;    /* Infinity */
    tp->snd_cwnd_clamp = ~0;
    tp->mss_cache = 536;

    tp->reordering = sysctl_tcp_reordering;
    icsk->icsk_ca_ops = &tcp_init_congestion_ops;

    sk->sk_state = TCP_CLOSE;

    /* Install the stream write-space callback on the sock. */
    sk->sk_write_space = sk_stream_write_space;
    sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

    icsk->icsk_af_ops = &ipv4_specific;
    icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
    tp->af_specific = &tcp_sock_ipv4_specific;
#endif

    /*
     * Buffer sizes come from element [1] of the sysctl arrays
     * (presumably the "default" slot of a min/default/max triple;
     * TODO confirm against the sysctl definitions).
     */
    sk->sk_sndbuf = sysctl_tcp_wmem[1];
    sk->sk_rcvbuf = sysctl_tcp_rmem[1];

    atomic_inc(&tcp_sockets_allocated);

    return 0;
}

首先函数出现了一个新的结构体struct tcp_sock,这个结构非常大,但是其作用随着我们的分析会越来越清晰,上面代码中对这个结构变量tp和sock结构变量sk做了进一步的初始化,以后这个地方的初始化工作我们要经常回顾,例如这里的sk->sk_write_space = sk_stream_write_space等钩子函数的挂入,以及缓冲区的相关设置。最后我们回到上一节提到的sys_socket()函数中执行retval = sock_map_fd(sock)来完成创建过程,sock_map_fd我们在unix socket中详细分析了,它在文件系统中分配一个文件号,以及file文件指针和目录项dentry结构使其与socket挂上钩,最后返回分配的文件号,至此以后我们可以根据这个文件号对创建的socket进行其他操作了。相关内容看unix socket创建http://blog.chinaunix.net/u2/64681/showart_1300200.html

阅读(5180) | 评论(1) | 转发(0) |
给主人留下些什么吧!~~

chinaunix网友2008-11-18 10:43:06

sk_stream_write_space 的作用是啥.