Chinaunix首页 | 论坛 | 博客
  • 博客访问: 669322
  • 博文数量: 81
  • 博客积分: 1659
  • 博客等级: 上尉
  • 技术积分: 1286
  • 用 户 组: 普通用户
  • 注册时间: 2011-11-02 16:36
个人简介

专注于嵌入式和图像处理

文章分类

全部博文(81)

文章存档

2014年(1)

2013年(7)

2012年(46)

2011年(27)

分类: C/C++

2012-02-17 18:39:41

Question:Calculate the Levenshtein distance (edit distance) of two files.
A commonly-used bottom-up dynamic programming algorithm for computing the Levenshtein distance involves the use of an (n + 1) × (m + 1) matrix, where n and m are the lengths of the two strings. Here is pseudocode for a function LevenshteinDistance that takes two strings, s of length m, and t of length n, and computes the Levenshtein distance between them:
int LevenshteinDistance(char s[1..m], char t[1..n])
   // d is a table with m+1 rows and n+1 columns
   declare int d[0..m, 0..n]
   for i from 0 to m
       d[i, 0] := i
   for j from 1 to n
       d[0, j] := j
 for i from 1 to m
       for j from 1 to n
           if s[i] = t[j] then cost := 0
                          else cost := 1
           d[i, j] := minimum(
                                d[i-1, j] + 1,     // deletion
                                d[i, j-1] + 1,     // insertion
                                d[i-1, j-1] + cost   // substitution
                             )
   return d[m, n]
Two examples of the resulting matrix (the minimum steps to be taken are highlighted):             
The invariant maintained throughout the algorithm is that we can transform the initial segment
s[1..i] into t[1..j] using a minimum of d[i,j] operations. At the end, the bottom-right element of the array contains the answer.
 
Code:
#include
#include
#include
 
typedef  unsigned int u_int;
 
u_int file_len(FILE *fp)
{
 fseek(fp,0L,SEEK_END);
 u_int len = ftell(fp);
 printf("file has %d chars!\n",len);
 return len;
 /*fpos_t len;
 fgetpos(fp,&len);
 printf("file has %d chars!\n",len);
 return len;
 */ 
}
 
u_int mininum(u_int a,u_int b)
{
 if(a>b)
  return b;
 else
  return a;  
}
 
u_int LevenshteinDistance(FILE *fp_src,char *fp_des,u_int src_len,u_int des_len)
{
 char *src,*des;
 u_int edit_distance;
 src = (char *)malloc((src_len)*sizeof(char));
 des = (char *)malloc(des_len*sizeof(char));
 fseek(fp_src,0L,SEEK_SET);
 fseek(fp_des,0L,SEEK_SET);
 fread(src,1,src_len,fp_src);
 fread(des,1,des_len,fp_des);
 //fgets(src,src_len+1,fp_src); //前面申请空间也要申请src_len+1
 //fgets(des,des_len+1,fp_des);
 /*fgets(char *s,int n, FILE *stream)只能读n-1个字符,自动加一个'\0',一起n个*/
 //printf("%s\n",src);
 //printf("%s\n",des);
 u_int **dis;
 u_int i,j;
 dis = (u_int **)malloc((src_len+1)*sizeof(u_int *));
 for(i=0;i<=src_len;i++)
 {
  dis[i] = (u_int *)malloc((des_len+1)*sizeof(u_int));
 }
 for(i=0;i<=src_len;i++)
  dis[i][0]=i;
 for(i=1;i<=des_len;i++)
  dis[0][i]=i;
 
 for(i=1;i<=src_len;i++)
 {
  for(j=1;j<=des_len;j++)
  {
   int cost;
   if(src[i-1]==des[j-1])
    cost = 0;
   else
    cost = 1;
   dis[i][j] = mininum(mininum(dis[i-1][j]+1 , dis[i][j-1]+1),
         dis[i-1][j-1]+cost);
  }
  }
 edit_distance = dis[src_len][des_len];
 free(src);
 free(des);
 for(i=0;i<=des_len;i++)
  free(dis[i]);
 free(dis);
 return edit_distance;
}
 
int main(int argc,char **argv)
{
 FILE *fp_src,*fp_des;
 u_int src_len,des_len;
 if(argc!=3)
 {
  printf("argvements error!\n");
  exit(1);
 }
 if((fp_src = fopen(argv[1],"r"))==NULL)
 {
  printf("open %s failure!\n",argv[1]);
  exit(1);
 }
 if((fp_des = fopen(argv[2],"r"))==NULL)
 {
  printf("open %s failure!\n",argv[2]);
  exit(1);
 }
 src_len = file_len(fp_src);
 des_len = file_len(fp_des);
 u_int edit_distance = LevenshteinDistance(fp_src, fp_des, src_len, des_len);
 fclose(fp_src);
 fclose(fp_des);
 printf("%s and %s edit distance is %d\n",argv[1],argv[2],edit_distance);
 return 0;
}
阅读(4057) | 评论(0) | 转发(0) |
0

上一篇:Linux启动过程综述

下一篇:Kermit

给主人留下些什么吧!~~