Chinaunix首页 | 论坛 | 博客
  • 博客访问: 492894
  • 博文数量: 138
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 716
  • 用 户 组: 普通用户
  • 注册时间: 2015-03-03 21:48
文章分类

全部博文(138)

文章存档

2019年(1)

2017年(5)

2016年(99)

2015年(33)

我的朋友

分类:

2016-01-20 10:14:49

原文地址:AC算法实现 作者:CUDev

这几天一直都在看多模式匹配的算法,昨天为了赶着能够在信息检索的课上show一下,就赶工写了一个AC算法的程序。实际上程序大部分代码来自于Snort,我做了一定的修改:将构建NFA的函数和把NFA转化为DFA的函数合并到了一块,并且删除了很多冗余的变量和函数。感觉Snort的代码写得有些乱,自我感觉我的代码应该比Snort的效率和可读性都要高出一些。
    我写的程序能够打印出每一个匹配到的关键词的位置,并且对关键词出现的次数进行统计。
下面是对gpl文件以free、software和copy为关键词进行的搜索。
Search the gpl.txt
Match KeyWord COPY at 4 line 1 char
Match KeyWord FREE at 4 line 26 char
Match KeyWord SOFTWARE at 4 line 31 char
Match KeyWord COPY at 6 line 26 char
Match KeyWord SOFTWARE at 11 line 24 char
Match KeyWord FREE at 12 line 0 char
Match KeyWord FREE at 13 line 38 char
Match KeyWord FREE at 13 line 66 char
Match KeyWord SOFTWARE at 14 line 0 char
Match KeyWord SOFTWARE at 14 line 27 char
Match KeyWord FREE at 14 line 39 char
Match KeyWord FREE at 15 line 46 char
Match KeyWord SOFTWARE at 15 line 51 char
Match KeyWord SOFTWARE at 16 line 13 char
Match KeyWord FREE at 17 line 23 char
Match KeyWord SOFTWARE at 17 line 28 char
Match KeyWord SOFTWARE at 17 line 48 char
Match KeyWord FREE at 21 line 19 char
Match KeyWord SOFTWARE at 21 line 24 char
Match KeyWord FREE at 21 line 54 char
Match KeyWord FREE at 23 line 9 char
Match KeyWord FREE at 23 line 41 char
Match KeyWord SOFTWARE at 23 line 46 char
Match KeyWord SOFTWARE at 25 line 40 char
Match KeyWord FREE at 26 line 7 char
Match KeyWord SOFTWARE at 31 line 25 char
Match KeyWord COPY at 39 line 45 char
Match KeyWord SOFTWARE at 39 line 59 char
Match KeyWord COPY at 40 line 63 char
Match KeyWord SOFTWARE at 41 line 29 char
Match KeyWord FREE at 44 line 61 char
Match KeyWord SOFTWARE at 45 line 0 char
Match KeyWord SOFTWARE at 45 line 18 char
Match KeyWord FREE at 50 line 15 char
Match KeyWord SOFTWARE at 50 line 56 char
Match KeyWord FREE at 51 line 63 char
Match KeyWord FREE at 54 line 39 char
Match KeyWord COPY at 56 line 39 char
Match KeyWord COPY at 60 line 28 char
Match KeyWord COPY at 63 line 23 char
Match KeyWord COPY at 66 line 54 char
Match KeyWord COPY at 72 line 22 char
Match KeyWord COPY at 79 line 13 char
Match KeyWord COPY at 81 line 48 char
Match KeyWord COPY at 82 line 0 char
Match KeyWord COPY at 84 line 47 char
Match KeyWord COPY at 87 line 60 char
Match KeyWord COPY at 90 line 25 char
Match KeyWord COPY at 91 line 53 char
Match KeyWord COPY at 106 line 42 char
Match KeyWord COPY at 109 line 57 char
Match KeyWord COPY at 134 line 13 char
Match KeyWord SOFTWARE at 140 line 51 char
Match KeyWord COPY at 145 line 21 char
Match KeyWord SOFTWARE at 147 line 25 char
Match KeyWord COPY at 167 line 10 char
Match KeyWord COPY at 168 line 10 char
Match KeyWord COPY at 170 line 13 char
Match KeyWord COPY at 172 line 17 char
Match KeyWord COPY at 174 line 13 char
Match KeyWord COPY at 186 line 33 char
Match KeyWord COPY at 191 line 21 char
Match KeyWord FREE at 205 line 33 char
Match KeyWord FREE at 218 line 17 char
Match KeyWord SOFTWARE at 218 line 22 char
Match KeyWord SOFTWARE at 220 line 44 char
Match KeyWord SOFTWARE at 223 line 14 char
Match KeyWord COPY at 230 line 42 char
Match KeyWord COPY at 231 line 9 char
Match KeyWord FREE at 237 line 9 char
Match KeyWord SOFTWARE at 237 line 14 char
Match KeyWord FREE at 245 line 64 char
Match KeyWord SOFTWARE at 246 line 0 char
Match KeyWord FREE at 247 line 63 char
Match KeyWord SOFTWARE at 247 line 68 char
Match KeyWord FREE at 250 line 65 char
Match KeyWord SOFTWARE at 252 line 28 char
Match KeyWord COPY at 252 line 46 char
Match KeyWord FREE at 252 line 65 char
Match KeyWord SOFTWARE at 253 line 0 char
Match KeyWord FREE at 253 line 34 char
Match KeyWord SOFTWARE at 253 line 39 char
Match KeyWord FREE at 255 line 18 char
Match KeyWord FREE at 255 line 56 char
Match KeyWord SOFTWARE at 255 line 61 char
Match KeyWord SOFTWARE at 256 line 38 char
Match KeyWord FREE at 260 line 38 char
Match KeyWord COPY at 262 line 32 char
Match KeyWord COPY at 271 line 9 char
Match KeyWord FREE at 286 line 0 char
Match KeyWord SOFTWARE at 286 line 5 char
Match KeyWord COPY at 291 line 5 char
Match KeyWord COPY at 294 line 4 char
Match KeyWord FREE at 296 line 20 char
Match KeyWord SOFTWARE at 296 line 25 char
Match KeyWord FREE at 298 line 8 char
Match KeyWord SOFTWARE at 298 line 13 char
Match KeyWord COPY at 306 line 31 char
Match KeyWord FREE at 307 line 44 char
Match KeyWord SOFTWARE at 307 line 49 char
Match KeyWord COPY at 315 line 28 char
Match KeyWord FREE at 317 line 12 char
Match KeyWord SOFTWARE at 317 line 17 char
Match KeyWord COPY at 326 line 27 char
Match KeyWord COPY at 329 line 39 char

### Summary ###
        FREE :    32
    SOFTWARE :    35
        COPY :    38

entry.c
#include "acsmx.h"

/*
*  Text Data Buffer: main reads the input file into it one fgets call at a
*  time (at most MAXLEN-1 characters per call) and hands it to acsmSearch.
*/
unsigned char text[MAXLEN];
/* Line counter reported with each match; main increments it after every fgets call. */
extern int nline;

int main (int argc, char **argv)
{
    int i, nocase = 0;
    FILE *fd;
    char filename[20];
    ACSM_STRUCT * acsm;

    if (argc < 3)
    {
        fprintf (stderr,"Usage: acsmx filename pattern1 pattern2 ...  -nocase\n");
        exit (0);
    }

    acsm = acsmNew ();

    strcpy (filename, argv[1]);
    fd = fopen(filename,"r");
    if(fd == NULL)
    {
        fprintf(stderr,"Open file error!\n");
        exit(1);
    }

    for (i = 1; i < argc; i++)
        if (strcmp (argv[i], "-nocase") == 0)
            nocase = 1;
    for (i = 2; i < argc; i++)
    {
        if (argv[i][0] == '-')
            continue;
        acsmAddPattern (acsm, argv[i], strlen (argv[i]), nocase,argv[i]);
    }

    /* Generate GtoTo Table and Fail Table */
    acsmCompile (acsm);

    /*Search Pattern*/
    while ( fgets(text,MAXLEN,fd) )
    {
        acsmSearch (acsm, text, strlen (text), PrintMatch);
        nline++;
    }

    PrintSummary(acsm->acsmPatterns);

    acsmFree (acsm);

    printf ("\n### AC Match Finished ###\n");
    system("pause");

    return (0);
}

acsmx.h
/*
**   ACSMX.H
**
**
*/
#ifndef ACSMX_H
#define ACSMX_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
*   Constants
*/
#define ALPHABET_SIZE    256     /* Number of entries in each state's NextState[] row (one per byte value) */
#define MAXLEN 256               /* Size of the text buffer that main fills with fgets */

#define ACSM_FAIL_STATE   -1     /* NextState[] value meaning "no transition for this byte" */

/*
*  One keyword.  Used both for the master list hanging off ACSM_STRUCT and
*  for the per-state match lists, which hold shallow copies of these nodes.
*/
typedef struct _acsm_pattern {      

    struct  _acsm_pattern *next;        /* Next node in the singly linked list */
    unsigned char         *patrn;       /* Upper-cased copy of the keyword (built by acsmAddPattern) */
    unsigned char         *casepatrn;   /* Keyword bytes exactly as supplied */
    int      n;                         /* Keyword length in bytes */
    int      nocase;                    /* Flag passed to acsmAddPattern; the print routines use it to pick patrn or casepatrn */
    void   * id;                        /* Not assigned or read by the code shown with this header */
    int         nmatch;                 /* Match counter: zeroed by acsmAddPattern, incremented by PrintMatch */

} ACSM_PATTERN;


/*
*  One state of the automaton.
*/
typedef struct  {    

    /* Next state - based on input character; ACSM_FAIL_STATE when no transition is defined */
    int      NextState[ ALPHABET_SIZE ];  

    /* Failure state - used while building NFA & DFA  */
    int      FailState;   

    /* List of patterns that end here, if any (NULL when none) */
    ACSM_PATTERN *MatchList;   

}ACSM_STATETABLE;


/*
* State machine Struct: the keyword list plus the state table compiled from it
*/
typedef struct {

    int acsmMaxStates;  /* Number of state-table entries allocated by acsmCompile: 1 + sum of keyword lengths */
    int acsmNumStates;  /* Index of the most recently created state (0 = only the root exists) */

    ACSM_PATTERN    * acsmPatterns;     /* Head of the keyword list; acsmAddPattern inserts at the front */
    ACSM_STATETABLE * acsmStateTable;   /* State array allocated by acsmCompile; NULL until then */

}ACSM_STRUCT;

/*
*   Prototypes
*/

/* Allocate a zero-filled state machine with no keywords. */
ACSM_STRUCT * acsmNew (void);

/* Register the n-byte keyword pat on machine p; nocase is stored on the keyword. Returns 0. */
int acsmAddPattern( ACSM_STRUCT * p, unsigned char * pat, int n,int nocase);

/* Build the state table from the registered keywords. Returns 0. */
int acsmCompile ( ACSM_STRUCT * acsm );

/*
*  Scan the n bytes at T and call Match once per keyword occurrence.
*  The callback has the same signature as PrintMatch, which is what callers
*  pass: (head of keyword list, matched entry, line number, start offset).
*  Returns the number of occurrences found.
*/
int acsmSearch ( ACSM_STRUCT * acsm,unsigned char * T, int n, void (*Match) (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index));

/* Release the memory owned by the state machine. */
void acsmFree ( ACSM_STRUCT * acsm );

/* Count one occurrence of mlist in the keyword list `pattern` and print its position. */
void PrintMatch (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index) ;

/* Print the match count of every keyword in the list `pattern`. */
void PrintSummary (ACSM_PATTERN * pattern) ;

#endif
acsm.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "acsmx.h"

/*
*  Terminate the program with a diagnostic when allocation result p is NULL;
*  s names the caller in the message.
*  Wrapped in do { } while (0) so the macro behaves as a single statement
*  (safe as the body of an if/else), with the arguments parenthesized.
*  Exits with status 1: running out of memory is a failure, not a success.
*/
#define MEMASSERT(p,s) \
    do { \
        if (!(p)) { \
            fprintf (stderr, "ACSM-No Memory: %s!\n", (s)); \
            exit (1); \
        } \
    } while (0)

/*
*  Line number reported with each match; starts at 1.
*  This is the one definition of the variable (other files declare it with
*  `extern`), so it carries the initializer and no `extern` keyword.
*/
int nline = 1;

/*
* Allocation wrapper used by the AC routines.
* Returns whatever malloc returns for an n-byte request; the result is not
* checked here, so callers must test it themselves.
*/
static void *AC_MALLOC (int n)
{
    return malloc (n);
}

/*
* Release a block obtained from AC_MALLOC.
* A NULL pointer is accepted and ignored.
*/
static void AC_FREE (void *p)
{
    if (p == NULL)
        return;

    free (p);
}


/*
*    Simple QUEUE NODE: one queued state number plus the link to the next node
*/
typedef struct _qnode
{
    int state;              /* State number carried by this node */
    struct _qnode *next;    /* Node behind this one in the queue; 0 for the last node */
}QNODE;

/*
*    Simple QUEUE Structure: singly linked FIFO of state numbers
*/
typedef struct _queue
{
    QNODE * head, *tail;    /* First and last node; both 0 when the queue is empty */
    int count;              /* Number of nodes currently queued */
}QUEUE;

/*
* Put a queue into the empty state: no nodes and a count of zero.
* The previous contents of *s are overwritten without being released.
*/
static void queue_init (QUEUE * s)
{
    s->count = 0;
    s->head = NULL;
    s->tail = NULL;
}


/*
*  Append a state number to the back of the queue.
*  The program exits through MEMASSERT if the node cannot be allocated.
*/
static void queue_add (QUEUE * s, int state)
{
    QNODE *node = (QNODE *) AC_MALLOC (sizeof (QNODE));

    /* No node, no queue: give up right away */
    MEMASSERT (node, "queue_add");

    node->state = state;
    node->next = NULL;

    if (s->head == NULL)
    {
        /* First element: it is both the front and the back */
        s->head = node;
    }
    else
    {
        /* Hook the node behind the current last element */
        s->tail->next = node;
    }
    s->tail = node;
    s->count++;
}


/*
*  Detach the front node and return the state number it carried.
*  Returns 0 when the queue is already empty.
*/
static int queue_remove (QUEUE * s)
{
    QNODE *front = s->head;
    int state;

    if (front == NULL)
        return 0;

    state = front->state;
    s->head = front->next;
    s->count--;

    /* That was the last node: reset the back pointer and the counter */
    if (s->head == NULL)
    {
        s->tail = NULL;
        s->count = 0;
    }

    AC_FREE (front);
    return state;
}


/*
* Return the number of nodes currently held in the queue (its count field).
*/
static int queue_count (QUEUE * s)
{
    return s->count;
}


/*
* Drain the queue, releasing every node that is still linked in.
* The QUEUE structure itself is not freed.
*/
static void queue_free (QUEUE * s)
{
    while (s->count != 0)
        (void) queue_remove (s);
}


/*
** Upper-casing lookup table, indexed by byte value
*/
static unsigned char xlatcase[256];

/*
* Fill the table so that entry c holds toupper(c) for every byte value c.
* Until this has run the table contains only zeros.
*/
static void init_xlatcase ()
{
    int c = 256;

    while (c-- > 0)
        xlatcase[c] = toupper (c);
}

/*
* Translate m bytes from s through the xlatcase table and store them in d.
* Nothing is written when m is zero or negative; no terminator is appended.
*/
static void ConvertCaseEx (unsigned char *d, unsigned char *s, int m)
{
    while (m-- > 0)
        *d++ = xlatcase[*s++];
}

/*
*  Record that pattern px ends at the given state.
*  A shallow copy of *px is allocated and pushed onto the front of that
*  state's MatchList; the copy shares px's string pointers.
*/
static void AddMatchListEntry (ACSM_STRUCT * acsm, int state, ACSM_PATTERN * px)
{
    ACSM_STATETABLE *slot = &acsm->acsmStateTable[state];
    ACSM_PATTERN *copy = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));

    MEMASSERT (copy, "AddMatchListEntry");
    memcpy (copy, px, sizeof (ACSM_PATTERN));

    /* Push the copy in front of whatever the state already lists */
    copy->next = slot->MatchList;
    slot->MatchList = copy;
}

/*
* Thread one pattern into the goto table.
* The bytes of p->patrn are followed along transitions that already exist;
* from the first byte without a transition onwards a fresh state is created
* per byte.  The state reached at the end gets p added to its MatchList.
*/
static void AddPatternStates (ACSM_STRUCT * acsm, ACSM_PATTERN * p)
{
    ACSM_STATETABLE *table = acsm->acsmStateTable;
    unsigned char *cursor = p->patrn;
    int remaining = p->n;   /* Bytes of the pattern not yet consumed */
    int state = 0;

    /* Phase 1: walk the prefix that is already present in the table */
    while (remaining > 0)
    {
        int target = table[state].NextState[*cursor];
        if (target == ACSM_FAIL_STATE)
            break;
        state = target;
        cursor++;
        remaining--;
    }

    /* Phase 2: allocate one new state for each byte that is left */
    while (remaining > 0)
    {
        acsm->acsmNumStates++;
        table[state].NextState[*cursor] = acsm->acsmNumStates;
        state = acsm->acsmNumStates;
        cursor++;
        remaining--;
    }

    /* The final state accepts this pattern */
    AddMatchListEntry (acsm, state, p);
}


/*
*   Build the failure links and, in the same breadth-first pass, fill every
*   missing transition so the table can be used as a DFA.
*
*   Expects the goto table to be populated already and every undefined
*   transition of state 0 to point back to state 0 (acsmCompile does both).
*
*   For each state s reached from r on byte i:
*     - FailState[s] is the state reached on i from r's failure chain;
*     - s inherits the MatchList of FailState[s], so a keyword that is a
*       suffix of the text consumed so far is reported as well (for example
*       "he" when the machine has just read "she").  Without this step such
*       overlapping keywords are silently missed.
*   For each byte i on which r has no transition, r borrows the transition
*   of its failure state.
*/
static void Build_NFA (ACSM_STRUCT * acsm)
{
    int r, s;
    int i;
    QUEUE q, *queue = &q;
    ACSM_PATTERN * mlist=0;

    /* Init a Queue */
    queue_init (queue);

    /* Add the state 0 transitions 1st */
    /* Every depth-1 state fails back to the root: fail(x)=0 */
    for (i = 0; i < ALPHABET_SIZE; i++)
    {
        s = acsm->acsmStateTable[0].NextState[i];
        if (s)
        {
            queue_add (queue, s);
            acsm->acsmStateTable[s].FailState = 0;
        }
    }

    /* Build the fail state transitions for each valid state */
    while (queue_count (queue) > 0)
    {
        r = queue_remove (queue);

        for (i = 0; i < ALPHABET_SIZE; i++)
        {
            int fs, next;

            if ((s = acsm->acsmStateTable[r].NextState[i]) != ACSM_FAIL_STATE)
            {
                /* Real child: schedule it and compute its failure state */
                queue_add (queue, s);
                fs = acsm->acsmStateTable[r].FailState;

                /*
                *  Follow r's failure chain until a state with a transition
                *  on 'i' is found.  The root has a transition on every byte,
                *  so the walk always ends.
                */
                while ((next=acsm->acsmStateTable[fs].NextState[i]) ==
                    ACSM_FAIL_STATE)
                {
                    fs = acsm->acsmStateTable[fs].FailState;
                }

                /*
                *  Update 's' state failure state to point to the next valid state
                */
                acsm->acsmStateTable[s].FailState = next;

                /*
                *  Inherit the keywords that end at the failure state.
                *  Each entry is copied (not shared) so that every list can
                *  be freed independently later on.
                */
                for (mlist = acsm->acsmStateTable[next].MatchList;
                    mlist != NULL; mlist = mlist->next)
                {
                    AddMatchListEntry (acsm, s, mlist);
                }
            }
            else
            {
                /* No child on 'i': reuse the failure state's transition */
                acsm->acsmStateTable[r].NextState[i] =
                    acsm->acsmStateTable[acsm->acsmStateTable[r].FailState].NextState[i];
            }
        }
    }

    /* Clean up the queue */
    queue_free (queue);
}


/*
*   Fill in the missing transitions of every reachable state.
*   States are visited breadth-first starting from the non-zero targets of
*   state 0; a byte with no transition gets the transition that the state's
*   FailState has for that byte.  FailState must already be set.
*   The only call to this function in acsmCompile is commented out.
*/
static void Convert_NFA_To_DFA (ACSM_STRUCT * acsm)
{
    ACSM_STATETABLE *table = acsm->acsmStateTable;
    QUEUE pending;
    int ch;

    queue_init (&pending);

    /* Seed the walk with the targets of the root's transitions */
    for (ch = 0; ch < ALPHABET_SIZE; ch++)
    {
        int child = table[0].NextState[ch];
        if (child != 0)
            queue_add (&pending, child);
    }

    /* Visit one state at a time, completing its transition row */
    while (queue_count (&pending) > 0)
    {
        int cur = queue_remove (&pending);
        int *row = table[cur].NextState;

        for (ch = 0; ch < ALPHABET_SIZE; ch++)
        {
            if (row[ch] != ACSM_FAIL_STATE)
            {
                /* Existing transition: its target is visited later */
                queue_add (&pending, row[ch]);
            }
            else
            {
                /* Gap: borrow the move of the failure state */
                row[ch] = table[table[cur].FailState].NextState[ch];
            }
        }
    }

    /* Release whatever is left in the queue */
    queue_free (&pending);
}


/*
* Create an empty state machine.
* Also (re)builds the case translation table.  The returned structure is
* zero-filled; the program exits through MEMASSERT if it cannot be allocated.
*/
ACSM_STRUCT * acsmNew ()
{
    ACSM_STRUCT *machine;

    init_xlatcase ();

    machine = (ACSM_STRUCT *) AC_MALLOC (sizeof (ACSM_STRUCT));
    MEMASSERT (machine, "acsmNew");

    /* MEMASSERT does not return on NULL, so machine is valid here */
    memset (machine, 0, sizeof (ACSM_STRUCT));
    return machine;
}


/*
*   Add a pattern to the list of patterns for this state machine
*
*   p       state machine to extend
*   pat     keyword bytes (n of them; need not be terminated)
*   n       keyword length in bytes
*   nocase  flag stored on the pattern
*
*   Two terminated copies of the keyword are kept: patrn (translated through
*   the case table) and casepatrn (bytes as given).  The new entry is put at
*   the front of p->acsmPatterns.
*
*   Returns 0 on success, -1 when p or pat is NULL or n is negative.
*   The program exits through MEMASSERT when memory runs out.
*/
int acsmAddPattern (ACSM_STRUCT * p, unsigned char *pat, int n, int nocase)
{
    ACSM_PATTERN * plist;

    /* A negative length would make the terminator land before the buffer */
    if (p == NULL || pat == NULL || n < 0)
        return -1;

    plist = (ACSM_PATTERN *) AC_MALLOC (sizeof (ACSM_PATTERN));
    MEMASSERT (plist, "acsmAddPattern");

    /* Both string buffers are written right away, so check each allocation */
    plist->patrn = (unsigned char *) AC_MALLOC (n+1);
    MEMASSERT (plist->patrn, "acsmAddPattern");
    ConvertCaseEx (plist->patrn, pat, n);
    plist->patrn[n] = 0;

    plist->casepatrn = (unsigned char *) AC_MALLOC (n+1);
    MEMASSERT (plist->casepatrn, "acsmAddPattern");
    memcpy (plist->casepatrn, pat, n);
    plist->casepatrn[n] = 0;

    plist->n = n;
    plist->nocase = nocase;
    plist->id = NULL;   /* Nothing to store; keep the field defined because the node is copied wholesale later */
    plist->nmatch=0;

    /*Add the pattern into the pattern list*/
    plist->next = p->acsmPatterns;
    p->acsmPatterns = plist;

    return 0;
}

/*
*   Compile State Machine
*
*   Sizes the state table from the total keyword length (one state per byte
*   plus the root), marks every transition as undefined, threads each
*   keyword into the table, loops the root's undefined transitions back to
*   the root and finally runs Build_NFA.
*   Always returns 0; the program exits through MEMASSERT when the table
*   cannot be allocated.
*/
int acsmCompile (ACSM_STRUCT * acsm)
{
    ACSM_STATETABLE *table;
    ACSM_PATTERN *pat;
    int state, ch;

    /* Upper bound on the number of states: the root plus one per keyword byte */
    acsm->acsmMaxStates = 1;
    for (pat = acsm->acsmPatterns; pat != NULL; pat = pat->next)
        acsm->acsmMaxStates += pat->n;

    table = (ACSM_STATETABLE *) AC_MALLOC (sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);
    acsm->acsmStateTable = table;
    MEMASSERT (table, "acsmCompile");
    memset (table, 0, sizeof (ACSM_STATETABLE) * acsm->acsmMaxStates);

    /* Only the root exists so far */
    acsm->acsmNumStates = 0;

    /* Every transition starts out undefined */
    for (state = 0; state < acsm->acsmMaxStates; state++)
        for (ch = 0; ch < ALPHABET_SIZE; ch++)
            table[state].NextState[ch] = ACSM_FAIL_STATE;

    /* Thread each keyword into the table (this creates the states) */
    for (pat = acsm->acsmPatterns; pat != NULL; pat = pat->next)
        AddPatternStates (acsm, pat);

    /* Bytes that start no keyword keep the machine in the root state */
    for (ch = 0; ch < ALPHABET_SIZE; ch++)
    {
        if (table[0].NextState[ch] == ACSM_FAIL_STATE)
            table[0].NextState[ch] = 0;
    }

    /* Failure links; Convert_NFA_To_DFA is intentionally not called here */
    Build_NFA (acsm);

    return 0;
}


/* Scratch buffer for the case-translated text (64KB) */
static unsigned char Tc[64*1024];

/*
*   Search Text or Binary Data for Pattern matches
*
*   acsm        compiled state machine
*   Tx          data to scan
*   n           number of bytes at Tx
*   PrintMatch  callback invoked once per occurrence with the head of the
*               keyword list, the matched entry, the global line number
*               nline and the offset in Tx at which the occurrence starts
*
*   The data is translated through the case table in pieces no larger than
*   the scratch buffer, so inputs longer than 64KB no longer overrun it; the
*   automaton state is carried from one piece to the next.
*
*   Returns the number of occurrences reported.
*/
int acsmSearch (ACSM_STRUCT * acsm, unsigned char *Tx, int n,void (*PrintMatch) (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index))
{
    ACSM_STATETABLE * StateTable = acsm->acsmStateTable;
    ACSM_PATTERN * mlist;
    int state = 0;
    int nfound = 0; /* Number of occurrences reported so far */
    int offset = 0; /* Bytes of Tx already scanned */
    int chunk, i, index;

    while (offset < n)
    {
        /* Never translate more than the scratch buffer can hold */
        chunk = n - offset;
        if (chunk > (int) sizeof (Tc))
            chunk = (int) sizeof (Tc);

        /* Case conversion */
        ConvertCaseEx (Tc, Tx + offset, chunk);

        for (i = 0; i < chunk; i++)
        {
            state = StateTable[state].NextState[Tc[i]];

            /* Report every keyword that ends at this byte (list is NULL for non-accepting states) */
            for (mlist = StateTable[state].MatchList; mlist != NULL;
                mlist = mlist->next)
            {
                /*
                *  Start offset of the occurrence, computed on integers so
                *  that no pointer before the start of a buffer is formed.
                */
                index = offset + i - mlist->n + 1;

                nfound++;
                PrintMatch (acsm->acsmPatterns, mlist, nline, index);
            }
        }

        offset += chunk;
    }

    return nfound;
}


/*
*   Free all memory
*
*   Releases, in order: the per-state match-list copies, the state table,
*   every keyword in the master list together with its two strings, and the
*   ACSM_STRUCT itself.  The match-list copies share the string pointers of
*   the master list, so the strings are freed once, through the master list.
*   acsm must not be used after this call.  A NULL acsm is ignored.
*/
void acsmFree (ACSM_STRUCT * acsm)
{
    int i;
    ACSM_PATTERN * mlist, *ilist;

    if (acsm == NULL)
        return;

    /* The table only exists once acsmCompile has run */
    if (acsm->acsmStateTable != NULL)
    {
        for (i = 0; i < acsm->acsmMaxStates; i++)
        {
            mlist = acsm->acsmStateTable[i].MatchList;
            while (mlist)
            {
                ilist = mlist;
                mlist = mlist->next;
                AC_FREE (ilist);    /* Node only: its strings belong to the master list */
            }
        }
        AC_FREE (acsm->acsmStateTable);
        acsm->acsmStateTable = NULL;
    }

    /* Master keyword list: nodes and the strings they own */
    mlist = acsm->acsmPatterns;
    while (mlist)
    {
        ilist = mlist;
        mlist = mlist->next;
        AC_FREE (ilist->patrn);
        AC_FREE (ilist->casepatrn);
        AC_FREE (ilist);
    }
    acsm->acsmPatterns = NULL;

    AC_FREE (acsm);
}

/*
*   Report one occurrence of a keyword.
*
*   pattern  head of the master keyword list
*   mlist    match-list entry that was hit
*   nline    line number to print
*   index    character offset to print
*
*   Every entry of the master list whose upper-cased text equals that of
*   mlist has its counter incremented; then one line is written to stdout,
*   showing the upper-cased spelling when the entry is marked nocase and the
*   original spelling otherwise.
*/
void PrintMatch (ACSM_PATTERN * pattern,ACSM_PATTERN * mlist, int nline,int index)
{
    ACSM_PATTERN *entry;
    const unsigned char *shown;

    /* Bump the counter of each master entry with the same text */
    for (entry = pattern; entry != NULL; entry = entry->next)
    {
        if (strcmp ((const char *) entry->patrn, (const char *) mlist->patrn) == 0)
            entry->nmatch++;
    }

    /* Pick the spelling to display */
    shown = mlist->nocase ? mlist->patrn : mlist->casepatrn;

    fprintf (stdout, "Match KeyWord %s at %d line %d char\n", shown,nline,index);
}

/*
* Print the summary header followed by one line per keyword in the list:
* the keyword (upper-cased spelling for nocase entries, original spelling
* otherwise) right-aligned in 12 columns, then its match count.
*/
void PrintSummary (ACSM_PATTERN * pattern)
{
    ACSM_PATTERN *entry = pattern;

    printf("\n### Summary ###\n");

    while (entry != NULL)
    {
        const unsigned char *shown = entry->nocase ? entry->patrn : entry->casepatrn;

        printf("%12s : %5d\n",shown,entry->nmatch);
        entry = entry->next;
    }
}

将代码打了一个包,由于原来的一个语料库比较大,就没有打进去包里面。自己尝试一个文本吧。写了一个bat脚本。

感谢8楼网友提供的补丁来解决pattern串的重叠情况下的问题。
文件:AC.rar
大小:390KB[修正了pattern串重叠情况下的Bug]
下载:下载

网页,我是按照行来读的,有可能将一些没有在网页上面显示,但是在网页源代码中的字也匹配出来。
阅读(993) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~