Question:Calculate the Levenshtein distance (edit distance) of two files.
A commonly-used bottom-up dynamic programming algorithm for computing the Levenshtein distance involves the use of an (n + 1) × (m + 1) matrix, where n and m are the lengths of the two strings. Here is pseudocode for a function LevenshteinDistance that takes two strings, s of length m, and t of length n, and computes the Levenshtein distance between them:
int LevenshteinDistance(char s[1..m], char t[1..n])
// d is a table with m+1 rows and n+1 columns
declare int d[0..m, 0..n]
for i from 0 to m
d[i, 0] := i
for j from 1 to n
d[0, j] := j
for i from 1 to m
for j from 1 to n
if s[i] = t[j] then cost := 0
else cost := 1
d[i, j] := minimum(
d[i-1, j] + 1, // deletion
d[i, j-1] + 1, // insertion
d[i-1, j-1] + cost // substitution
)
return d[m, n]
Two examples of the resulting matrix (the minimum steps to be taken are highlighted):
The invariant maintained throughout the algorithm is that we can transform the initial segment
s[1..i] into t[1..j] using a minimum of d[i,j] operations. At the end, the bottom-right element of the array contains the answer.
Code:
#include
#include
#include
typedef unsigned int u_int;
u_int file_len(FILE *fp)
{
fseek(fp,0L,SEEK_END);
u_int len = ftell(fp);
printf("file has %d chars!\n",len);
return len;
/*fpos_t len;
fgetpos(fp,&len);
printf("file has %d chars!\n",len);
return len;
*/
}
u_int mininum(u_int a,u_int b)
{
if(a>b)
return b;
else
return a;
}
u_int LevenshteinDistance(FILE *fp_src,char *fp_des,u_int src_len,u_int des_len)
{
char *src,*des;
u_int edit_distance;
src = (char *)malloc((src_len)*sizeof(char));
des = (char *)malloc(des_len*sizeof(char));
fseek(fp_src,0L,SEEK_SET);
fseek(fp_des,0L,SEEK_SET);
fread(src,1,src_len,fp_src);
fread(des,1,des_len,fp_des);
//fgets(src,src_len+1,fp_src); //前面申请空间也要申请src_len+1
//fgets(des,des_len+1,fp_des);
/*fgets(char *s,int n, FILE *stream)只能读n-1个字符,自动加一个'\0',一起n个*/
//printf("%s\n",src);
//printf("%s\n",des);
u_int **dis;
u_int i,j;
dis = (u_int **)malloc((src_len+1)*sizeof(u_int *));
for(i=0;i<=src_len;i++)
{
dis[i] = (u_int *)malloc((des_len+1)*sizeof(u_int));
}
for(i=0;i<=src_len;i++)
dis[i][0]=i;
for(i=1;i<=des_len;i++)
dis[0][i]=i;
for(i=1;i<=src_len;i++)
{
for(j=1;j<=des_len;j++)
{
int cost;
if(src[i-1]==des[j-1])
cost = 0;
else
cost = 1;
dis[i][j] = mininum(mininum(dis[i-1][j]+1 , dis[i][j-1]+1),
dis[i-1][j-1]+cost);
}
}
edit_distance = dis[src_len][des_len];
free(src);
free(des);
for(i=0;i<=des_len;i++)
free(dis[i]);
free(dis);
return edit_distance;
}
int main(int argc,char **argv)
{
FILE *fp_src,*fp_des;
u_int src_len,des_len;
if(argc!=3)
{
printf("argvements error!\n");
exit(1);
}
if((fp_src = fopen(argv[1],"r"))==NULL)
{
printf("open %s failure!\n",argv[1]);
exit(1);
}
if((fp_des = fopen(argv[2],"r"))==NULL)
{
printf("open %s failure!\n",argv[2]);
exit(1);
}
src_len = file_len(fp_src);
des_len = file_len(fp_des);
u_int edit_distance = LevenshteinDistance(fp_src, fp_des, src_len, des_len);
fclose(fp_src);
fclose(fp_des);
printf("%s and %s edit distance is %d\n",argv[1],argv[2],edit_distance);
return 0;
}
阅读(4068) | 评论(0) | 转发(0) |