memcached探索之thread model(1)-itlanger-ChinaUnix博客

itlanger's blogitlanger.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

itlanger

博客访问： 659190
博文数量： 227
博客积分： 8017
博客等级：中将
技术积分： 2069
用户组：普通用户
注册时间： 2007-12-08 22:50

文章分类

全部博文（227）

Taiji（4）
C/C++（14）
Storage（11）
Misc（6）
Algorithm（6）
Embeded（30）
System（30）
Kernel（32）
Network（19）
Coding（36）
Emacs（6）
Mood（24）
Scripts（9）
未分配的博文（0）

文章存档

2011年（10）

2010年（55）

2009年（28）

2008年（134）

我的朋友

相关博文

memcached探索之thread model(1)

分类：服务器与存储

2010-09-19 22:44:10

memcached利用libevent实现了一个线程池。有一个dispatch_thread和N个worker_thread构成。
本节分析memcached的线程池模型

1. main函数的中线程初始化

static struct event_base *main_base;

/* initialize main thread libevent instance */ main_base = event_init(); .... /* start up worker threads if MT mode */ thread_init(settings.num_threads, main_base);

2. thread.c中的thread_init, 不过先看看connection_queue吧。

#define ITEMS_PER_ALLOC 64



/* An item in the connection queue. */

typedef struct conn_queue_item CQ_ITEM;

struct conn_queue_item {

    int               sfd;

    enum conn_states  init_state;

    int               event_flags;

    int               read_buffer_size;

    enum network_transport     transport;

    CQ_ITEM          *next;

};



/* A connection queue. */
每个工作线程都有一个connection queue, CQ中的每个CQ_ITEM都是对一个socket连接的简单描述

typedef struct conn_queue CQ; 

struct conn_queue {

    CQ_ITEM *head;

    CQ_ITEM *tail;

    pthread_mutex_t lock;

    pthread_cond_t  cond;

};



/* Lock for cache operations (item_*, assoc_*) */

pthread_mutex_t cache_lock;



/* Connection lock around accepting new connections */

pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;



/* Lock for global stats */

static pthread_mutex_t stats_lock;



/* Free list of CQ_ITEM structs */

static CQ_ITEM *cqi_freelist;

static pthread_mutex_t cqi_freelist_lock;

/*



 typedef struct { //定义在memcached.h中
    pthread_t thread_id;        /* unique ID of this thread */
    struct event_base *base;    /* libevent handle this thread uses */
} LIBEVENT_DISPATCHER_THREAD;

*/


static LIBEVENT_DISPATCHER_THREAD dispatcher_thread;



/*

 * Each libevent instance has a wakeup pipe, which other threads

 * can use to signal that they've put a new connection on its queue.

typedef struct { //memcached.h中定义
    pthread_t thread_id;        /* unique ID of this thread */
    struct event_base *base;    /* libevent handle this thread uses */
    struct event notify_event;  /* listen event for notify pipe */
    int notify_receive_fd;      /* receiving end of notify pipe */
    int notify_send_fd;         /* sending end of notify pipe */
    struct thread_stats stats;  /* Stats generated by this thread */
    struct conn_queue *new_conn_queue; /* queue of new connections to handle */
    cache_t *suffix_cache;      /* suffix cache */
} LIBEVENT_THREAD;

 */

static LIBEVENT_THREAD *threads;



/*

 * Number of worker threads that have finished setting themselves up.

 */

static int init_count = 0;

static pthread_mutex_t init_lock;

static pthread_cond_t init_cond;

//下面是CQ的一些操作。。。


/* 

 * Initializes a connection queue.

 */

static void cq_init(CQ *cq) {

    pthread_mutex_init(&cq->lock, NULL);

    pthread_cond_init(&cq->cond, NULL);

    cq->head = NULL;

    cq->tail = NULL;

}



/*

 * Looks for an item on a connection queue, but doesn't block if there isn't

 * one.

 * Returns the item, or NULL if no item is available

 */

static CQ_ITEM *cq_pop(CQ *cq) {

    CQ_ITEM *item;



    pthread_mutex_lock(&cq->lock);

    item = cq->head;

    if (NULL != item) {

        cq->head = item->next;

        if (NULL == cq->head)

            cq->tail = NULL;

    }

    pthread_mutex_unlock(&cq->lock);



    return item;

}



/*

 * Adds an item to a connection queue.

 */

static void cq_push(CQ *cq, CQ_ITEM *item) {

    item->next = NULL;



    pthread_mutex_lock(&cq->lock);

    if (NULL == cq->tail)

        cq->head = item;

    else

        cq->tail->next = item;

    cq->tail = item;

    pthread_cond_signal(&cq->cond);

    pthread_mutex_unlock(&cq->lock);

}



/*

 * Returns a fresh connection queue item.

 */

static CQ_ITEM *cqi_new(void) {

    CQ_ITEM *item = NULL;

    pthread_mutex_lock(&cqi_freelist_lock);

    if (cqi_freelist) {

        item = cqi_freelist;

        cqi_freelist = item->next;

    }

    pthread_mutex_unlock(&cqi_freelist_lock);



    if (NULL == item) {

        int i;



        /* Allocate a bunch of items at once to reduce fragmentation */

        item = malloc(sizeof(CQ_ITEM) * ITEMS_PER_ALLOC);

        if (NULL == item)

            return NULL;



        /*

         * Link together all the new items except the first one

         * (which we'll return to the caller) for placement on

         * the freelist.

         */

        for (i = 2; i < ITEMS_PER_ALLOC; i++)

            item[i - 1].next = &item[i];



        pthread_mutex_lock(&cqi_freelist_lock);

        item[ITEMS_PER_ALLOC - 1].next = cqi_freelist;

        cqi_freelist = &item[1];

        pthread_mutex_unlock(&cqi_freelist_lock);

    }



    return item;

}

/*

 * Frees a connection queue item (adds it to the freelist.)

 */

static void cqi_free(CQ_ITEM *item) {

    pthread_mutex_lock(&cqi_freelist_lock);

    item->next = cqi_freelist;

    cqi_freelist = item;

    pthread_mutex_unlock(&cqi_freelist_lock);

}

/* * Initializes the thread subsystem, creating various worker threads. * * nthreads Number of worker event handler threads to spawn * main_base Event base for main thread */ void thread_init(int nthreads, struct event_base *main_base) { int i; pthread_mutex_init(&cache_lock, NULL); pthread_mutex_init(&stats_lock, NULL); pthread_mutex_init(&init_lock, NULL); pthread_cond_init(&init_cond, NULL); pthread_mutex_init(&cqi_freelist_lock, NULL); cqi_freelist = NULL; threads = calloc(nthreads, sizeof(LIBEVENT_THREAD)); if (! threads) { perror("Can't allocate thread descriptors"); exit(1); } dispatcher_thread.base = main_base; // 指定dispatcher_thread的event_base dispatcher_thread.thread_id = pthread_self(); for (i = 0; i < nthreads; i++) { int fds[2]; if (pipe(fds)) { //对每个工作线程创建一个管道，用以通知其消息。 perror("Can't create notify pipe"); exit(1); } threads[i].notify_receive_fd = fds[0]; threads[i].notify_send_fd = fds[1]; setup_thread(&threads[i]); } /* Create threads after we've done all the libevent setup. */ for (i = 0; i < nthreads; i++) { create_worker(worker_libevent, &threads[i]); } /* Wait for all the threads to set themselves up before returning. */ pthread_mutex_lock(&init_lock); while (init_count < nthreads) { pthread_cond_wait(&init_cond, &init_lock); } pthread_mutex_unlock(&init_lock); }

//接着看看setup_thread和create_worker

static void setup_thread(LIBEVENT_THREAD *me) {     me->base = event_init(); 初始化每一个worker_thread的event_base     if (! me->base) {         fprintf(stderr, "Can't allocate event base\n");         exit(1);     }    //设置监听事件，在管道可读时调用thread_event_process函数
    /* Listen for notifications from other threads */     event_set(&me->notify_event, me->notify_receive_fd,               EV_READ | EV_PERSIST, thread_libevent_process, me);     event_base_set(me->base, &me->notify_event);     if (event_add(&me->notify_event, 0) == -1) { //no timeout         fprintf(stderr, "Can't monitor libevent notify pipe\n");         exit(1);     }   //每个worker_thread一个conn_queue
    me->new_conn_queue = malloc(sizeof(struct conn_queue));     if (me->new_conn_queue == NULL) {         perror("Failed to allocate memory for connection queue");         exit(EXIT_FAILURE);     }     cq_init(me->new_conn_queue);     if (pthread_mutex_init(&me->stats.mutex, NULL) != 0) {         perror("Failed to initialize mutex");         exit(EXIT_FAILURE);     }    //suffix_cache是干什么用的呢？
    me->suffix_cache = cache_create("suffix", SUFFIX_SIZE, sizeof(char*),                                     NULL, NULL);     if (me->suffix_cache == NULL) {         fprintf(stderr, "Failed to create suffix cache\n");         exit(EXIT_FAILURE);     } }

//仅仅是调用pthread_create创建线程。。。 static void create_worker(void *(*func)(void *), void *arg) { pthread_t thread; pthread_attr_t attr; int ret; pthread_attr_init(&attr); if ((ret = pthread_create(&thread, &attr, func, arg)) != 0) { fprintf(stderr, "Can't create thread: %s\n", strerror(ret)); exit(1); } }

3. 下面继续跟踪，看看每个worker_thread注册的thread_libevent_process函数

static void thread_libevent_process(int fd, short which, void *arg) { LIBEVENT_THREAD *me = arg; CQ_ITEM *item; char buf[1]; //读一下管道。表示处理事件，而管道的另一端是由dispatcher_thread写的
if (read(fd, buf, 1) != 1) if (settings.verbose > 0) fprintf(stderr, "Can't read from libevent pipe\n"); item = cq_pop(me->new_conn_queue); //pop出一个CQ_ITEM if (NULL != item) { //建立一个connection对象 conn *c = conn_new(item->sfd, item->init_state, item->event_flags, item->read_buffer_size, item->transport, me->base); if (c == NULL) { if (IS_UDP(item->transport)) { fprintf(stderr, "Can't listen for events on UDP socket\n"); exit(1); } else { if (settings.verbose > 0) { fprintf(stderr, "Can't listen for events on fd %d\n", item->sfd); } close(item->sfd); } } else { c->thread = me; //创建conn对象成功后，将该工作线程结构体链接到conn对象中。 } cqi_free(item); } }

4. 继续conn的一些管理函数, 尤其是conn_new

/**

 * The structure representing a connection into memcached. 定义在memcached.h中

 */

typedef struct conn conn;

struct conn {

    int    sfd;

    sasl_conn_t *sasl_conn;

    enum conn_states  state;

    enum bin_substates substate;

    struct event event;

    short  ev_flags;

    short  which;   /** which events were just triggered */



    char   *rbuf;   /** buffer to read commands into */

    char   *rcurr;  /** but if we parsed some already, this is where we stopped */

    int    rsize;   /** total allocated size of rbuf */

    int    rbytes;  /** how much data, starting from rcur, do we have unparsed */



    char   *wbuf;

    char   *wcurr;

    int    wsize;

    int    wbytes;

    /** which state to go into after finishing current write */

    enum conn_states  write_and_go;

    void   *write_and_free; /** free this memory after finishing writing */



    char   *ritem;  /** when we read in an item's value, it goes here */

    int    rlbytes;



    /* data for the nread state */



    /**

     * item is used to hold an item structure created after reading the command

     * line of set/add/replace commands, but before we finished reading the actual

     * data. The data is read into ITEM_data(item) to avoid extra copying.

     */



    void   *item;     /* for commands set/add/replace  */



    /* data for the swallow state */

    int    sbytes;    /* how many bytes to swallow */



    /* data for the mwrite state */

    struct iovec *iov;

    int    iovsize;   /* number of elements allocated in iov[] */

    int    iovused;   /* number of elements used in iov[] */



    struct msghdr *msglist;

    int    msgsize;   /* number of elements allocated in msglist[] */

    int    msgused;   /* number of elements used in msglist[] */

    int    msgcurr;   /* element in msglist[] being transmitted now */

    int    msgbytes;  /* number of bytes in current msg */



    item   **ilist;   /* list of items to write out */

    int    isize;

    item   **icurr;

    int    ileft;



    char   **suffixlist;

    int    suffixsize;

    char   **suffixcurr;

    int    suffixleft;



    enum protocol protocol;   /* which protocol this connection speaks */

    enum network_transport transport; /* what transport is used by this connection */



    /* data for UDP clients */

    int    request_id; /* Incoming UDP request ID, if this is a UDP "connection" */

    struct sockaddr request_addr; /* Who sent the most recent request */

    socklen_t request_addr_size;

    unsigned char *hdrbuf; /* udp packet headers */

    int    hdrsize;   /* number of headers' worth of space is allocated */



    bool   noreply;   /* True if the reply should not be sent. */

    /* current stats command */

    struct {

        char *buffer;

        size_t size;

        size_t offset;

    } stats;



    /* Binary protocol stuff */

    /* This is where the binary header goes */

    protocol_binary_request_header binary_header;

    uint64_t cas; /* the cas to return */

    short cmd; /* current command being processed */

    int opaque;

    int keylen;

    conn   *next;     /* Used for generating a list of conn structures */

    LIBEVENT_THREAD *thread; /* Pointer to the thread object serving this connection */

};



/*

 * Free list management for connections. thread.h中

 */

static conn **freeconns;   //conn的freelist

static int freetotal;

static int freecurr;

/* Lock for connection freelist */

static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;



static void conn_init(void) {

    freetotal = 200;

    freecurr = 0;

    if ((freeconns = calloc(freetotal, sizeof(conn *))) == NULL) {

        fprintf(stderr, "Failed to allocate connection structures\n");

    }

    return;

}

/* * Adds a connection to the freelist. 0 = success. */ bool conn_add_to_freelist(conn *c) { bool ret = true; pthread_mutex_lock(&conn_lock); if (freecurr < freetotal) { freeconns[freecurr++] = c; ret = false; } else { //否则需要增长链表长度, memcached中好多个地方都是这样增长list /* try to enlarge free connections array */ size_t newsize = freetotal * 2; conn **new_freeconns = realloc(freeconns, sizeof(conn *) * newsize); if (new_freeconns) { freetotal = newsize; freeconns = new_freeconns; freeconns[freecurr++] = c; ret = false; } } pthread_mutex_unlock(&conn_lock); return ret; } static const char *prot_text(enum protocol prot) { char *rv = "unknown"; switch(prot) { case ascii_prot: rv = "ascii"; break; case binary_prot: rv = "binary"; break; case negotiating_prot: rv = "auto-negotiate"; break; } return rv; }

//核心函数之一，每个worker_pthread在get一个CQ_ITEM都都会调用。。。

conn *conn_new(const int sfd, enum conn_states init_state, const int event_flags, const int read_buffer_size, enum network_transport transport, struct event_base *base) { conn *c = conn_from_freelist(); if (NULL == c) { if (!(c = (conn *)calloc(1, sizeof(conn)))) { fprintf(stderr, "calloc()\n"); return NULL; } MEMCACHED_CONN_CREATE(c); c->rbuf = c->wbuf = 0; c->ilist = 0; c->suffixlist = 0; c->iov = 0; c->msglist = 0; c->hdrbuf = 0; c->rsize = read_buffer_size; c->wsize = DATA_BUFFER_SIZE; //2048 c->isize = ITEM_LIST_INITIAL; //200 c->suffixsize = SUFFIX_LIST_INITIAL; //20 c->iovsize = IOV_LIST_INITIAL; //400 c->msgsize = MSG_LIST_INITIAL; //10 c->hdrsize = 0; c->rbuf = (char *)malloc((size_t)c->rsize); c->wbuf = (char *)malloc((size_t)c->wsize); c->ilist = (item **)malloc(sizeof(item *) * c->isize); c->suffixlist = (char **)malloc(sizeof(char *) * c->suffixsize); c->iov = (struct iovec *)malloc(sizeof(struct iovec) * c->iovsize); c->msglist = (struct msghdr *)malloc(sizeof(struct msghdr) * c->msgsize); if (c->rbuf == 0 || c->wbuf == 0 || c->ilist == 0 || c->iov == 0 || c->msglist == 0 || c->suffixlist == 0) { conn_free(c); fprintf(stderr, "malloc()\n"); return NULL; } STATS_LOCK(); stats.conn_structs++; STATS_UNLOCK(); }

    /* 定义在memcached.h中

    enum network_transport {
         local_transport, /* Unix sockets*/
         tcp_transport,
         udp_transport

    };*/

c->transport = transport;

c->protocol = settings.binding_protocol; /* unix socket mode doesn't need this, so zeroed out. but why * is this done for every command? presumably for UDP * mode. */ if (!settings.socketpath) { c->request_addr_size = sizeof(c->request_addr); } else { c->request_addr_size = 0; } if (settings.verbose > 1) { if (init_state == conn_listening) { fprintf(stderr, "<%d server listening (%s)\n", sfd, prot_text(c->protocol)); } else if (IS_UDP(transport)) { fprintf(stderr, "<%d server listening (udp)\n", sfd); } else if (c->protocol == negotiating_prot) { fprintf(stderr, "<%d new auto-negotiating client connection\n", sfd); } else if (c->protocol == ascii_prot) { fprintf(stderr, "<%d new ascii client connection.\n", sfd); } else if (c->protocol == binary_prot) { fprintf(stderr, "<%d new binary client connection.\n", sfd); } else { fprintf(stderr, "<%d new unknown (%d) client connection\n", sfd, c->protocol); assert(false); } } c->sfd = sfd; c->state = init_state; c->rlbytes = 0; c->cmd = -1; c->rbytes = c->wbytes = 0; c->wcurr = c->wbuf; c->rcurr = c->rbuf; c->ritem = 0; c->icurr = c->ilist; c->suffixcurr = c->suffixlist; c->ileft = 0; c->suffixleft = 0; c->iovused = 0; c->msgcurr = 0; c->msgused = 0; c->write_and_go = init_state; c->write_and_free = 0; c->item = 0; c->noreply = false; //将本connection的thread注册到worker_thread的event_base中，event_flag来源于CQ_ITEM的event_flag, 每个connection事件发生时会回调event_handler函数 event_set(&c->event, sfd, event_flags, event_handler, (void *)c); event_base_set(base, &c->event); c->ev_flags = event_flags; if (event_add(&c->event, 0) == -1) { //never timeout if (conn_add_to_freelist(c)) { conn_free(c); } perror("event_add"); return NULL; } STATS_LOCK(); stats.curr_conns++; stats.total_conns++; STATS_UNLOCK(); MEMCACHED_CONN_ALLOCATE(c->sfd); return c; } static void conn_cleanup(conn *c) { assert(c != NULL); if (c->item) { item_remove(c->item); c->item = 0; } if (c->ileft != 0) { for (; c->ileft > 0; c->ileft--,c->icurr++) { item_remove(*(c->icurr)); } } if (c->suffixleft != 0) { for (; c->suffixleft > 0; c->suffixleft--, c->suffixcurr++) { cache_free(c->thread->suffix_cache, *(c->suffixcurr)); } } if (c->write_and_free) { free(c->write_and_free); c->write_and_free = 0; } if (c->sasl_conn) { assert(settings.sasl); sasl_dispose(&c->sasl_conn); c->sasl_conn = NULL; } } /* * Frees a connection. */ void conn_free(conn *c) { if (c) { MEMCACHED_CONN_DESTROY(c); if (c->hdrbuf) free(c->hdrbuf); if (c->msglist) free(c->msglist); if (c->rbuf) free(c->rbuf); if (c->wbuf) free(c->wbuf); if (c->ilist) free(c->ilist); if (c->suffixlist) free(c->suffixlist); if (c->iov) free(c->iov); free(c); } } static void conn_close(conn *c) { assert(c != NULL); /* delete the event, the socket and the conn */ event_del(&c->event); if (settings.verbose > 1) fprintf(stderr, "<%d connection closed.\n", c->sfd); MEMCACHED_CONN_RELEASE(c->sfd); close(c->sfd); accept_new_conns(true); conn_cleanup(c); /* if the connection has big buffers, just free it */ if (c->rsize > READ_BUFFER_HIGHWAT || conn_add_to_freelist(c)) { conn_free(c); } STATS_LOCK(); stats.curr_conns--; STATS_UNLOCK(); return; } /* * Shrinks a connection's buffers if they're too big. This prevents * periodic large "get" requests from permanently chewing lots of server * memory. * * This should only be called in between requests since it can wipe output * buffers! */ static void conn_shrink(conn *c) { assert(c != NULL); if (IS_UDP(c->transport)) return; if (c->rsize > READ_BUFFER_HIGHWAT && c->rbytes < DATA_BUFFER_SIZE) { char *newbuf; if (c->rcurr != c->rbuf) memmove(c->rbuf, c->rcurr, (size_t)c->rbytes); newbuf = (char *)realloc((void *)c->rbuf, DATA_BUFFER_SIZE); if (newbuf) { c->rbuf = newbuf; c->rsize = DATA_BUFFER_SIZE; } /* TODO check other branch... */ c->rcurr = c->rbuf; } if (c->isize > ITEM_LIST_HIGHWAT) { item **newbuf = (item**) realloc((void *)c->ilist, ITEM_LIST_INITIAL * sizeof(c->ilist[0])); if (newbuf) { c->ilist = newbuf; c->isize = ITEM_LIST_INITIAL; } /* TODO check error condition? */ } if (c->msgsize > MSG_LIST_HIGHWAT) { struct msghdr *newbuf = (struct msghdr *) realloc((void *)c->msglist, MSG_LIST_INITIAL * sizeof(c->msglist[0])); if (newbuf) { c->msglist = newbuf; c->msgsize = MSG_LIST_INITIAL; } /* TODO check error condition? */ } if (c->iovsize > IOV_LIST_HIGHWAT) { struct iovec *newbuf = (struct iovec *) realloc((void *)c->iov, IOV_LIST_INITIAL * sizeof(c->iov[0])); if (newbuf) { c->iov = newbuf; c->iovsize = IOV_LIST_INITIAL; } /* TODO check return value */ } }

5. 看看在conn_new中注册到worker_thread的event_base的conn_event事件处理函数event_handler

void event_handler(const int fd, const short which, void *arg) { conn *c; c = (conn *)arg; assert(c != NULL); c->which = which; /* sanity */ if (fd != c->sfd) { if (settings.verbose > 0) fprintf(stderr, "Catastrophic: event fd doesn't match conn fd!\n"); conn_close(c); return; } drive_machine(c); //最终调用一个状态机处理函数。这个比较复杂，下篇再继续 /* wait for next event */ return; }

阅读(1753) | 评论(0) | 转发(0) |

上一篇：robots协议

下一篇：HTTP协议

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6