所用的算法是大家熟悉的KMP匹配算法,不过这个算法可以根据需要处理大文件字符串匹配,你也可以改变分组长度来加快匹配速度。
#include
#include
#include
#include
#include
#define MAXSIZE 1024
// 计算字符个数
long get_file_size(char *filename){
FILE *fp=fopen(filename,"r");
char ch;
long num=0;
while(ch=getc(fp)!=EOF)
num++;
fclose(fp);
return num;
}
void get_nextval(unsigned char pat[],int nextval[],int length)
{
int i=1;
int j=0;
nextval[1]=0;
while(i {
if(j==0||pat[i-1]==pat[j-1])
{
++i;
++j;
if(pat[i-1]!=pat[j-1]) nextval[i]=j;
else nextval[i]=nextval[j];
}
else j=nextval[j];
}
}
int Index_KMP(unsigned char text[], unsigned char pat[],int nextval[],int t_len,int p_len)
{
int i=0;
int j=0;
int count=0;
while (i<=t_len&&j<=p_len)
{
if(j==0||text[i-1]==pat[j-1]){++i;++j;}
else j=nextval[j];
if (j>p_len)
{
count++;
j=1;
}
}
if (count>0) return count;
else return -1;
}
int Serach(char *file1,char *file2)
{
int i=0;
int count=0;
int answer;
long ReadSize;
long tLen=0;//主串文件长度
long pLen=0;//模式串文件长度
int BlockNum=0;//分组大小
long tlastlen=0;//
unsigned char *tData;//为存放文件数据申请空间的起始位置
unsigned char *pData;//为存放文件数据申请空间的起始位置
int* nextval;
FILE* tFile=NULL;
FILE* pFile=NULL;
tLen=get_file_size(file1);
pLen=get_file_size(file2);
printf("主串长度为:%ld字节.\n",tLen);
printf("模式串长度为:%ld字节.\n",pLen);
if(tLen==0||tLen
return 0;
BlockNum=tLen/MAXSIZE;
tlastlen=tLen%MAXSIZE;
BlockNum=tlastlen?(BlockNum+1):BlockNum;
if((tFile= fopen(file1,"r"))==NULL)
{
printf("Open tFile Error!\n");//DEBUG
return 0;//错误处理机制
}
if((pFile= fopen(file2,"r"))==NULL)
{
printf("Open pFile Error!\n");//DEBUG
return 0;//错误处理机制
}
tData = (unsigned char*)malloc((unsigned int)(MAXSIZE+pLen-1+16)*sizeof(char));
if(!tData) return -1;
pData = (unsigned char*)malloc((unsigned int)(pLen+16)*sizeof(char));
if(!pData) return -1;
nextval = (int*)malloc((unsigned int)(pLen+16)*sizeof(int));
if(!nextval) return -1;
fread(pData, pLen, 1, pFile);
get_nextval(pData,nextval,pLen);
if(BlockNum==1)
{
ReadSize=(tlastlen)?tlastlen:BlockNum;
fread(tData, ReadSize, 1, tFile);
if((answer=Index_KMP(tData, pData,nextval,ReadSize,pLen))>=0)
count+=answer;
}
else
{
fread(tData, MAXSIZE, 1, tFile);
if((answer=Index_KMP(tData, pData,nextval,MAXSIZE,pLen))>=0)
count+=answer;
strncpy(tData,tData+MAXSIZE-pLen+1,pLen-1);
fread(tData+pLen-1, MAXSIZE, 1, tFile);
if((answer=Index_KMP(tData, pData,nextval,MAXSIZE+pLen-1,pLen))>=0)
count+=answer;
BlockNum=tlastlen?BlockNum:(BlockNum+1);
for(i=0;i {
strncpy(tData,tData+MAXSIZE,pLen-1);
fread(tData+pLen-1, MAXSIZE, 1, tFile);
if((answer=Index_KMP(tData, pData,nextval,MAXSIZE+pLen-1,pLen))>=0)
count+=answer;
}
if(tlastlen)
{
strncpy(tData,tData+MAXSIZE,pLen-1);
fread(tData+pLen-1, tlastlen, 1, tFile);
if((answer=Index_KMP(tData, pData,nextval,tlastlen+pLen-1,pLen))>=0)
count+=answer;
}
}
free(tData);
free(pData);
free(nextval);
fclose(tFile);
fclose(pFile);
return count;
}
void main()
{
clock_t start,end,total=0;
double a;
int all=0;
char *p="p.txt";
char *t="t.txt";
printf("\nKMP String Searching Program");
printf("\n====================================\n");
start=clock();
all=Serach(t,p);
end=clock();
total=end-start;
a=(double)total/(double)CLOCKS_PER_SEC;
printf("process time is %lf s\n",a);
if(total==-1)
{
printf("alloc error!\n");
}
printf("find the number is %d .\n",all);
}